From 3a8ad33dde2f059b82ebf09f5cffa66c86f2e734 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 13 Aug 2010 02:20:40 -0700
Subject: [PATCH] i965: Add a pass for the FS to reduce vector expressions down
 to scalar.

This is a step towards implementing a GLSL IR backend for the 965
fragment shader.  Because it has downsides with the current codegen,
it is hidden under the environment variable INTEL_NEW_FS.

This results in an increase in instruction count at the moment (1444
-> 1752 for glsl-fs-raytrace, 345 -> 359 on my demo), because dot
products are turned into a series of multiplies and adds instead of a
custom expansion of MULs and MACs, and by not splitting the variable
types up we don't get tree grafting and thus there are extra moves of
temporary storage.  However, register count drops for the non-GLSL
path (64 -> 56 on my demo shader) because the register allocator sees
all the sub-operations.
---
 src/mesa/drivers/dri/i965/Makefile            |   3 +-
 src/mesa/drivers/dri/i965/brw_fs.cpp          |  15 +
 .../dri/i965/brw_fs_channel_expressions.cpp   | 365 ++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_wm.h            |   2 +
 4 files changed, 384 insertions(+), 1 deletion(-)
 create mode 100644 src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp

diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index bc4cfab5c02..39acae9e434 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -105,7 +105,8 @@ C_SOURCES = \
 	$(DRIVER_SOURCES)
 
 CXX_SOURCES = \
-	brw_fs.cpp
+	brw_fs.cpp \
+	brw_fs_channel_expressions.cpp
 
 ASM_SOURCES = 
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 9509d932367..d16e75a2ca9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -36,6 +36,7 @@ extern "C" {
 #include "brw_wm.h"
 #include "talloc.h"
 }
+#include "../glsl/ir_optimization.h"
 
 struct gl_shader *
 brw_new_shader(GLcontext *ctx, GLuint name, GLuint type)
@@ -75,6 +76,20 @@ brw_compile_shader(GLcontext *ctx, struct gl_shader *shader)
 GLboolean
 brw_link_shader(GLcontext *ctx, struct gl_shader_program *prog)
 {
+   static int using_new_fs = -1;
+
+   if (using_new_fs == -1)
+      using_new_fs = getenv("INTEL_NEW_FS") != NULL;
+
+   for (unsigned i = 0; i < prog->_NumLinkedShaders; i++) {
+      struct gl_shader *shader = prog->_LinkedShaders[i];
+
+      if (using_new_fs && shader->Type == GL_FRAGMENT_SHADER) {
+	 do_mat_op_to_vec(shader->ir);
+	 brw_do_channel_expressions(shader->ir);
+      }
+   }
+
    if (!_mesa_ir_link_shader(ctx, prog))
       return GL_FALSE;
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
new file mode 100644
index 00000000000..d8d58a9467b
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -0,0 +1,365 @@
+/*
+ * Copyright Â© 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_wm_channel_expressions.cpp
+ *
+ * Breaks vector operations down into operations on each component.
+ *
+ * The 965 fragment shader receives 8 or 16 pixels at a time, so each
+ * channel of a vector is laid out as 1 or 2 8-float registers.  Each
+ * ALU operation operates on one of those channel registers.  As a
+ * result, there is no value to the 965 fragment shader in tracking
+ * "vector" expressions in the sense of GLSL fragment shaders, when
+ * doing a channel at a time may help in constant folding, algebraic
+ * simplification, and reducing the liveness of channel registers.
+ *
+ * The exception to the desire to break everything down to floats is
+ * texturing.  The texture sampler returns a writemasked masked
+ * 4/8-register sequence containing the texture values.  We don't want
+ * to dispatch to the sampler separately for each channel we need, so
+ * we do retain the vector types in that case.
+ */
+
+extern "C" {
+#include "main/core.h"
+#include "brw_wm.h"
+}
+#include "../glsl/ir.h"
+#include "../glsl/ir_expression_flattening.h"
+#include "../glsl/glsl_types.h"
+
+class ir_channel_expressions_visitor : public ir_hierarchical_visitor {
+public:
+   ir_channel_expressions_visitor()
+   {
+      this->progress = false;
+      this->mem_ctx = NULL;
+   }
+
+   ir_visitor_status visit_leave(ir_assignment *);
+
+   ir_rvalue *get_element(ir_variable *var, unsigned int element);
+   void assign(ir_assignment *ir, int elem, ir_rvalue *val);
+
+   bool progress;
+   void *mem_ctx;
+};
+
+static bool
+channel_expressions_predicate(ir_instruction *ir)
+{
+   ir_expression *expr = ir->as_expression();
+   unsigned int i;
+
+   if (!expr)
+      return false;
+
+   for (i = 0; i < expr->get_num_operands(); i++) {
+      if (expr->operands[i]->type->is_vector())
+	 return true;
+   }
+
+   return false;
+}
+
+extern "C" {
+GLboolean
+brw_do_channel_expressions(exec_list *instructions)
+{
+   ir_channel_expressions_visitor v;
+
+   /* Pull out any matrix expression to a separate assignment to a
+    * temp.  This will make our handling of the breakdown to
+    * operations on the matrix's vector components much easier.
+    */
+   do_expression_flattening(instructions, channel_expressions_predicate);
+
+   visit_list_elements(&v, instructions);
+
+   return v.progress;
+}
+}
+
+ir_rvalue *
+ir_channel_expressions_visitor::get_element(ir_variable *var, unsigned int elem)
+{
+   ir_dereference *deref;
+
+   if (var->type->is_scalar())
+      return new(mem_ctx) ir_dereference_variable(var);
+
+   assert(elem < var->type->components());
+   deref = new(mem_ctx) ir_dereference_variable(var);
+   return new(mem_ctx) ir_swizzle(deref, elem, 0, 0, 0, 1);
+}
+
+void
+ir_channel_expressions_visitor::assign(ir_assignment *ir, int elem, ir_rvalue *val)
+{
+   ir_dereference *lhs = ir->lhs->clone(mem_ctx, NULL);
+   ir_assignment *assign;
+   ir_swizzle *val_swiz;
+
+   /* This assign-of-expression should have been generated by the
+    * expression flattening visitor (since we never short circit to
+    * not flatten, even for plain assignments of variables), so the
+    * writemask is always full.
+    */
+   assert(ir->write_mask == (1 << ir->lhs->type->components()) - 1);
+
+   /* Smear the float across all the channels for the masked write. */
+   val_swiz = new(mem_ctx) ir_swizzle(val, 0, 0, 0, 0,
+				      ir->lhs->type->components());
+   assign = new(mem_ctx) ir_assignment(lhs, val_swiz, NULL, (1 << elem));
+   ir->insert_before(assign);
+}
+
+ir_visitor_status
+ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
+{
+   ir_expression *expr = ir->rhs->as_expression();
+   bool found_vector = false;
+   unsigned int i, vector_elements = 1;
+   ir_variable *op_var[2];
+
+   if (!expr)
+      return visit_continue;
+
+   if (!this->mem_ctx)
+      this->mem_ctx = talloc_parent(ir);
+
+   for (i = 0; i < expr->get_num_operands(); i++) {
+      if (expr->operands[i]->type->is_vector()) {
+	 found_vector = true;
+	 vector_elements = expr->operands[i]->type->vector_elements;
+	 break;
+      }
+   }
+   if (!found_vector)
+      return visit_continue;
+
+   /* Store the expression operands in temps so we can use them
+    * multiple times.
+    */
+   for (i = 0; i < expr->get_num_operands(); i++) {
+      ir_assignment *assign;
+      ir_dereference *deref;
+
+      assert(!expr->operands[i]->type->is_matrix());
+
+      op_var[i] = new(mem_ctx) ir_variable(expr->operands[i]->type,
+					   "channel_expressions",
+					   ir_var_temporary);
+      ir->insert_before(op_var[i]);
+
+      deref = new(mem_ctx) ir_dereference_variable(op_var[i]);
+      assign = new(mem_ctx) ir_assignment(deref,
+					  expr->operands[i],
+					  NULL);
+      ir->insert_before(assign);
+   }
+
+   const glsl_type *element_type = glsl_type::get_instance(ir->lhs->type->base_type,
+							   1, 1);
+
+   /* OK, time to break down this vector operation. */
+   switch (expr->operation) {
+   case ir_unop_bit_not:
+   case ir_unop_logic_not:
+   case ir_unop_neg:
+   case ir_unop_abs:
+   case ir_unop_sign:
+   case ir_unop_rcp:
+   case ir_unop_rsq:
+   case ir_unop_sqrt:
+   case ir_unop_exp:
+   case ir_unop_log:
+   case ir_unop_exp2:
+   case ir_unop_log2:
+   case ir_unop_f2i:
+   case ir_unop_i2f:
+   case ir_unop_f2b:
+   case ir_unop_b2f:
+   case ir_unop_i2b:
+   case ir_unop_b2i:
+   case ir_unop_u2f:
+   case ir_unop_trunc:
+   case ir_unop_ceil:
+   case ir_unop_floor:
+   case ir_unop_fract:
+   case ir_unop_sin:
+   case ir_unop_cos:
+   case ir_unop_dFdx:
+   case ir_unop_dFdy:
+      for (i = 0; i < vector_elements; i++) {
+	 ir_rvalue *op0 = get_element(op_var[0], i);
+
+	 assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
+						  element_type,
+						  op0,
+						  NULL));
+      }
+      break;
+
+   case ir_binop_add:
+   case ir_binop_sub:
+   case ir_binop_mul:
+   case ir_binop_div:
+   case ir_binop_mod:
+   case ir_binop_min:
+   case ir_binop_max:
+   case ir_binop_pow:
+   case ir_binop_lshift:
+   case ir_binop_rshift:
+   case ir_binop_bit_and:
+   case ir_binop_bit_xor:
+   case ir_binop_bit_or:
+      for (i = 0; i < vector_elements; i++) {
+	 ir_rvalue *op0 = get_element(op_var[0], i);
+	 ir_rvalue *op1 = get_element(op_var[1], i);
+
+	 assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
+						  element_type,
+						  op0,
+						  op1));
+      }
+      break;
+
+   case ir_unop_any: {
+      ir_expression *temp;
+      temp = new(mem_ctx) ir_expression(ir_binop_logic_or,
+					element_type,
+					get_element(op_var[0], 0),
+					get_element(op_var[0], 1));
+
+      for (i = 2; i < vector_elements; i++) {
+	 temp = new(mem_ctx) ir_expression(ir_binop_logic_or,
+					   element_type,
+					   get_element(op_var[0], i),
+					   temp);
+      }
+      assign(ir, 0, temp);
+      break;
+   }
+
+   case ir_binop_dot: {
+      ir_expression *last = NULL;
+      for (i = 0; i < vector_elements; i++) {
+	 ir_rvalue *op0 = get_element(op_var[0], i);
+	 ir_rvalue *op1 = get_element(op_var[1], i);
+	 ir_expression *temp;
+
+	 temp = new(mem_ctx) ir_expression(ir_binop_mul,
+					   element_type,
+					   op0,
+					   op1);
+	 if (last) {
+	    last = new(mem_ctx) ir_expression(ir_binop_add,
+					      element_type,
+					      temp,
+					      last);
+	 } else {
+	    last = temp;
+	 }
+      }
+      assign(ir, 0, last);
+      break;
+   }
+
+   case ir_binop_cross: {
+      for (i = 0; i < vector_elements; i++) {
+	 int swiz0 = (i + 1) % 3;
+	 int swiz1 = (i + 2) % 3;
+	 ir_expression *temp1, *temp2;
+
+	 temp1 = new(mem_ctx) ir_expression(ir_binop_mul,
+					    element_type,
+					    get_element(op_var[0], swiz0),
+					    get_element(op_var[1], swiz1));
+
+	 temp2 = new(mem_ctx) ir_expression(ir_binop_mul,
+					    element_type,
+					    get_element(op_var[1], swiz0),
+					    get_element(op_var[0], swiz1));
+
+	 temp2 = new(mem_ctx) ir_expression(ir_unop_neg,
+					    element_type,
+					    temp2,
+					    NULL);
+
+	 assign(ir, i, new(mem_ctx) ir_expression(ir_binop_add,
+						  element_type,
+						  temp1, temp2));
+      }
+      break;
+   }
+
+   case ir_binop_less:
+   case ir_binop_greater:
+   case ir_binop_lequal:
+   case ir_binop_gequal:
+   case ir_binop_logic_and:
+   case ir_binop_logic_xor:
+   case ir_binop_logic_or:
+      ir->print();
+      printf("\n");
+      assert(!"not reached: expression operates on scalars only");
+      break;
+   case ir_binop_equal:
+   case ir_binop_nequal: {
+      ir_expression *last = NULL;
+      for (i = 0; i < vector_elements; i++) {
+	 ir_rvalue *op0 = get_element(op_var[0], i);
+	 ir_rvalue *op1 = get_element(op_var[1], i);
+	 ir_expression *temp;
+	 ir_expression_operation join;
+
+	 if (expr->operation == ir_binop_equal)
+	    join = ir_binop_logic_and;
+	 else
+	    join = ir_binop_logic_or;
+
+	 temp = new(mem_ctx) ir_expression(expr->operation,
+					   element_type,
+					   op0,
+					   op1);
+	 if (last) {
+	    last = new(mem_ctx) ir_expression(join,
+					      element_type,
+					      temp,
+					      last);
+	 } else {
+	    last = temp;
+	 }
+      }
+      assign(ir, 0, last);
+      break;
+   }
+   }
+
+   ir->remove();
+   this->progress = true;
+
+   return visit_continue;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 25a72f5dda9..438da1af622 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -465,4 +465,6 @@ GLboolean brw_link_shader(GLcontext *ctx, struct gl_shader_program *prog);
 struct gl_shader *brw_new_shader(GLcontext *ctx, GLuint name, GLuint type);
 struct gl_shader_program *brw_new_shader_program(GLcontext *ctx, GLuint name);
 
+GLboolean brw_do_channel_expressions(struct exec_list *instructions);
+
 #endif
-- 
2.30.2