AC_SUBST([OPENCL_VERSION])
dnl Versions for external dependencies
- LIBDRM_REQUIRED=2.4.60
+ LIBDRM_REQUIRED=2.4.66
LIBDRM_RADEON_REQUIRED=2.4.56
LIBDRM_AMDGPU_REQUIRED=2.4.63
LIBDRM_INTEL_REQUIRED=2.4.61
AC_SUBST([GBM_PC_REQ_PRIV])
AC_SUBST([GBM_PC_LIB_PRIV])
+AM_CONDITIONAL(HAVE_VULKAN, true)
+
dnl
dnl EGL configuration
dnl
if test "x$enable_vdpau" = xyes; then
PKG_CHECK_MODULES([VDPAU], [vdpau >= $VDPAU_REQUIRED])
gallium_st="$gallium_st vdpau"
+ DEFINES="$DEFINES -DHAVE_ST_VDPAU"
fi
AM_CONDITIONAL(HAVE_ST_VDPAU, test "x$enable_vdpau" = xyes)
fi
}
+ swr_llvm_check() {
+ gallium_require_llvm $1
+ if test ${LLVM_VERSION_INT} -lt 306; then
+ AC_MSG_ERROR([LLVM version 3.6 or later required when building $1])
+ fi
+ if test "x$enable_gallium_llvm" != "xyes"; then
+ AC_MSG_ERROR([--enable-gallium-llvm is required when building $1])
+ fi
+ }
+
dnl Duplicates in GALLIUM_DRIVERS_DIRS are removed by sorting it after this block
if test -n "$with_gallium_drivers"; then
gallium_drivers=`IFS=', '; echo $with_gallium_drivers`
HAVE_GALLIUM_LLVMPIPE=yes
fi
;;
+ xswr)
+ AX_CXX_COMPILE_STDCXX([11], [noext], [mandatory])
+ swr_llvm_check "swr"
+
+ AC_MSG_CHECKING([whether $CXX supports AVX/AVX2])
+ AVX_CXXFLAGS="-march=core-avx-i"
+ AVX2_CXXFLAGS="-march=core-avx2"
+
+ AC_LANG_PUSH([C++])
+ save_CXXFLAGS="$CXXFLAGS"
+ CXXFLAGS="$AVX_CXXFLAGS $CXXFLAGS"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],[],
+ [AC_MSG_ERROR([AVX compiler support not detected])])
+ CXXFLAGS="$save_CXXFLAGS"
+
+  save_CXXFLAGS="$CXXFLAGS"
+ CXXFLAGS="$AVX2_CXXFLAGS $CXXFLAGS"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],[],
+ [AC_MSG_ERROR([AVX2 compiler support not detected])])
+ CXXFLAGS="$save_CXXFLAGS"
+ AC_LANG_POP([C++])
+
+ HAVE_GALLIUM_SWR=yes
+ ;;
xvc4)
HAVE_GALLIUM_VC4=yes
gallium_require_drm "vc4"
AM_CONDITIONAL(HAVE_GALLIUM_FREEDRENO, test "x$HAVE_GALLIUM_FREEDRENO" = xyes)
AM_CONDITIONAL(HAVE_GALLIUM_SOFTPIPE, test "x$HAVE_GALLIUM_SOFTPIPE" = xyes)
AM_CONDITIONAL(HAVE_GALLIUM_LLVMPIPE, test "x$HAVE_GALLIUM_LLVMPIPE" = xyes)
+ AM_CONDITIONAL(HAVE_GALLIUM_SWR, test "x$HAVE_GALLIUM_SWR" = xyes)
AM_CONDITIONAL(HAVE_GALLIUM_VC4, test "x$HAVE_GALLIUM_VC4" = xyes)
AM_CONDITIONAL(HAVE_GALLIUM_VIRGL, test "x$HAVE_GALLIUM_VIRGL" = xyes)
AC_SUBST([XA_TINY], $XA_TINY)
AC_SUBST([XA_VERSION], "$XA_MAJOR.$XA_MINOR.$XA_TINY")
+PKG_CHECK_MODULES(VALGRIND, [valgrind],
+ [have_valgrind=yes], [have_valgrind=no])
+if test "x$have_valgrind" = "xyes"; then
+ AC_DEFINE([HAVE_VALGRIND], 1,
+ [Use valgrind intrinsics to suppress false warnings])
+fi
+
dnl Restore LDFLAGS and CPPFLAGS
LDFLAGS="$_SAVE_LDFLAGS"
CPPFLAGS="$_SAVE_CPPFLAGS"
src/gallium/drivers/rbug/Makefile
src/gallium/drivers/softpipe/Makefile
src/gallium/drivers/svga/Makefile
+ src/gallium/drivers/swr/Makefile
+ src/gallium/drivers/swr/avx/Makefile
+ src/gallium/drivers/swr/avx2/Makefile
src/gallium/drivers/trace/Makefile
src/gallium/drivers/vc4/Makefile
src/gallium/drivers/virgl/Makefile
src/glx/apple/Makefile
src/glx/tests/Makefile
src/gtest/Makefile
+ src/intel/Makefile
+ src/intel/genxml/Makefile
+ src/intel/isl/Makefile
+ src/intel/vulkan/Makefile
+ src/intel/vulkan/tests/Makefile
src/loader/Makefile
src/mapi/Makefile
src/mapi/es1api/glesv1_cm.pc
nir/nir_control_flow_private.h \
nir/nir_dominance.c \
nir/nir_from_ssa.c \
+ nir/nir_gather_info.c \
nir/nir_gs_count_vertices.c \
+ nir/nir_inline_functions.c \
nir/nir_intrinsics.c \
nir/nir_intrinsics.h \
nir/nir_instr_set.c \
nir/nir_lower_clip.c \
nir/nir_lower_global_vars_to_local.c \
nir/nir_lower_gs_intrinsics.c \
- nir/nir_lower_indirect_derefs.c \
nir/nir_lower_load_const_to_scalar.c \
nir/nir_lower_locals_to_regs.c \
nir/nir_lower_idiv.c \
+ nir/nir_lower_indirect_derefs.c \
nir/nir_lower_io.c \
nir/nir_lower_outputs_to_temporaries.c \
nir/nir_lower_phis_to_scalar.c \
+ nir/nir_lower_returns.c \
nir/nir_lower_samplers.c \
nir/nir_lower_system_values.c \
nir/nir_lower_tex.c \
nir/nir_opt_peephole_select.c \
nir/nir_opt_remove_phis.c \
nir/nir_opt_undef.c \
+ nir/nir_phi_builder.c \
+ nir/nir_phi_builder.h \
nir/nir_print.c \
nir/nir_remove_dead_variables.c \
+ nir/nir_repair_ssa.c \
nir/nir_search.c \
nir/nir_search.h \
nir/nir_split_var_copies.c \
nir/nir_vla.h \
nir/nir_worklist.c \
nir/nir_worklist.h
+
+SPIRV_FILES = \
+ nir/spirv/nir_spirv.h \
+ nir/spirv/spirv_to_nir.c \
+ nir/spirv/vtn_alu.c \
+ nir/spirv/vtn_cfg.c \
+ nir/spirv/vtn_glsl450.c \
+ nir/spirv/vtn_private.h \
+ nir/spirv/vtn_variables.c
this->extensions = &ctx->Extensions;
+ this->ARB_compute_shader_enable = true;
+
this->Const.MaxLights = ctx->Const.MaxLights;
this->Const.MaxClipPlanes = ctx->Const.MaxClipPlanes;
this->Const.MaxTextureUnits = ctx->Const.MaxTextureUnits;
EXT(ARB_gpu_shader_fp64, true, false, ARB_gpu_shader_fp64),
EXT(ARB_sample_shading, true, false, ARB_sample_shading),
EXT(ARB_separate_shader_objects, true, false, dummy_true),
+ EXT(ARB_shader_atomic_counter_ops, true, false, ARB_shader_atomic_counter_ops),
EXT(ARB_shader_atomic_counters, true, false, ARB_shader_atomic_counters),
EXT(ARB_shader_bit_encoding, true, false, ARB_shader_bit_encoding),
EXT(ARB_shader_clock, true, false, ARB_shader_clock),
block->layout.flags.i |= block_interface_qualifier;
if (state->stage == MESA_SHADER_GEOMETRY &&
- state->has_explicit_attrib_stream()) {
+ state->has_explicit_attrib_stream() &&
+ block->layout.flags.q.out) {
/* Assign global layout's stream value. */
block->layout.flags.q.stream = 1;
block->layout.flags.q.explicit_stream = 0;
nir_control_flow_private.h \
nir_dominance.c \
nir_from_ssa.c \
+ nir_gather_info.c \
nir_gs_count_vertices.c \
+ nir_inline_functions.c \
nir_intrinsics.c \
nir_intrinsics.h \
nir_instr_set.c \
nir_lower_clip.c \
nir_lower_global_vars_to_local.c \
nir_lower_gs_intrinsics.c \
- nir_lower_indirect_derefs.c \
nir_lower_load_const_to_scalar.c \
nir_lower_locals_to_regs.c \
nir_lower_idiv.c \
+ nir_lower_indirect_derefs.c \
nir_lower_io.c \
nir_lower_outputs_to_temporaries.c \
nir_lower_phis_to_scalar.c \
+ nir_lower_returns.c \
nir_lower_samplers.c \
nir_lower_system_values.c \
nir_lower_tex.c \
nir_opt_peephole_select.c \
nir_opt_remove_phis.c \
nir_opt_undef.c \
+ nir_phi_builder.c \
+ nir_phi_builder.h \
nir_print.c \
nir_remove_dead_variables.c \
+ nir_repair_ssa.c \
nir_search.c \
nir_search.h \
nir_split_var_copies.c \
nir_worklist.c \
nir_worklist.h
+SPIRV_FILES = \
+ spirv/nir_spirv.h \
+ spirv/spirv_to_nir.c \
+ spirv/vtn_alu.c \
+ spirv/vtn_cfg.c \
+ spirv/vtn_glsl450.c \
+ spirv/vtn_private.h \
+ spirv/vtn_variables.c
+
class nir_visitor : public ir_visitor
{
public:
- nir_visitor(nir_shader *shader);
+ nir_visitor(nir_shader *shader, gl_shader *sh);
~nir_visitor();
virtual void visit(ir_variable *);
bool supports_ints;
+ struct gl_shader *sh;
+
nir_shader *shader;
nir_function_impl *impl;
nir_builder b;
nir_shader *shader = nir_shader_create(NULL, stage, options);
- nir_visitor v1(shader);
+ nir_visitor v1(shader, sh);
nir_function_visitor v2(&v1);
v2.run(sh->ir);
visit_exec_list(sh->ir, &v1);
- nir_lower_outputs_to_temporaries(shader);
+ nir_function *main = NULL;
+ nir_foreach_function(shader, func) {
+ if (strcmp(func->name, "main") == 0) {
+ main = func;
+ break;
+ }
+ }
+ assert(main);
+
+ nir_lower_outputs_to_temporaries(shader, main);
shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name);
if (shader_prog->Label)
return shader;
}
-nir_visitor::nir_visitor(nir_shader *shader)
+nir_visitor::nir_visitor(nir_shader *shader, gl_shader *sh)
{
this->supports_ints = shader->options->native_integers;
this->shader = shader;
+ this->sh = sh;
this->is_global = true;
this->var_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
_mesa_key_pointer_equal);
}
var->data.index = ir->data.index;
+ var->data.descriptor_set = 0;
var->data.binding = ir->data.binding;
var->data.offset = ir->data.offset;
var->data.image.read_only = ir->data.image_read_only;
nir_function *func = nir_function_create(shader, ir->function_name());
- unsigned num_params = ir->parameters.length();
- func->num_params = num_params;
- func->params = ralloc_array(shader, nir_parameter, num_params);
-
- unsigned i = 0;
- foreach_in_list(ir_variable, param, &ir->parameters) {
- switch (param->data.mode) {
- case ir_var_function_in:
- func->params[i].param_type = nir_parameter_in;
- break;
-
- case ir_var_function_out:
- func->params[i].param_type = nir_parameter_out;
- break;
-
- case ir_var_function_inout:
- func->params[i].param_type = nir_parameter_inout;
- break;
-
- default:
- unreachable("not reached");
- }
-
- func->params[i].type = param->type;
- i++;
- }
-
- func->return_type = ir->return_type;
+ assert(ir->parameters.is_empty());
+ assert(ir->return_type == glsl_type::void_type);
_mesa_hash_table_insert(this->overload_table, ir, func);
}
nir_function_impl *impl = nir_function_impl_create(func);
this->impl = impl;
- unsigned num_params = func->num_params;
- impl->num_params = num_params;
- impl->params = ralloc_array(this->shader, nir_variable *, num_params);
- unsigned i = 0;
- foreach_in_list(ir_variable, param, &ir->parameters) {
- param->accept(this);
- impl->params[i] = this->var;
- i++;
- }
-
- if (func->return_type == glsl_type::void_type) {
- impl->return_var = NULL;
- } else {
- impl->return_var = ralloc(this->shader, nir_variable);
- impl->return_var->name = ralloc_strdup(impl->return_var,
- "return_var");
- impl->return_var->type = func->return_type;
- }
+ assert(strcmp(func->name, "main") == 0);
+ assert(ir->parameters.is_empty());
+ assert(func->return_type == glsl_type::void_type);
this->is_global = false;
exec_list_make_empty(&shader->uniforms);
exec_list_make_empty(&shader->inputs);
exec_list_make_empty(&shader->outputs);
+ exec_list_make_empty(&shader->shared);
shader->options = options;
memset(&shader->info, 0, sizeof(shader->info));
shader->num_inputs = 0;
shader->num_outputs = 0;
shader->num_uniforms = 0;
+ shader->num_shared = 0;
shader->stage = stage;
exec_list_push_tail(&shader->uniforms, &var->node);
break;
+ case nir_var_shared:
+ assert(shader->stage == MESA_SHADER_COMPUTE);
+ exec_list_push_tail(&shader->shared, &var->node);
+ break;
+
case nir_var_system_value:
exec_list_push_tail(&shader->system_values, &var->node);
break;
impl->return_var->type = function->return_type;
impl->return_var->data.mode = nir_var_param;
impl->return_var->data.location = -1;
+ } else {
+ impl->return_var = NULL;
}
return impl;
return nir_cf_node_as_function(node);
}
+/* Reduces a cursor to a canonical form: prefer the "after" variants and
+ * widen to block granularity when possible, so equivalent cursors reduce
+ * to identical values.
+ */
+static nir_cursor
+reduce_cursor(nir_cursor cursor)
+{
+ switch (cursor.option) {
+ case nir_cursor_before_block:
+ if (exec_list_is_empty(&cursor.block->instr_list)) {
+ /* Empty block. After is as good as before. */
+ cursor.option = nir_cursor_after_block;
+ } else {
+ /* Try to switch to after the previous block if there is one.
+ * (This isn't likely, but it can happen.)
+ */
+ nir_cf_node *prev_node = nir_cf_node_prev(&cursor.block->cf_node);
+ if (prev_node && prev_node->type == nir_cf_node_block) {
+ cursor.block = nir_cf_node_as_block(prev_node);
+ cursor.option = nir_cursor_after_block;
+ }
+ }
+ return cursor;
+
+ case nir_cursor_after_block:
+ return cursor;
+
+ case nir_cursor_before_instr: {
+ nir_instr *prev_instr = nir_instr_prev(cursor.instr);
+ if (prev_instr) {
+ /* Before this instruction is after the previous */
+ cursor.instr = prev_instr;
+ cursor.option = nir_cursor_after_instr;
+ } else {
+ /* No previous instruction. Switch to before block */
+ cursor.block = cursor.instr->block;
+ cursor.option = nir_cursor_before_block;
+ }
+ return reduce_cursor(cursor);
+ }
+
+ case nir_cursor_after_instr:
+ if (nir_instr_next(cursor.instr) == NULL) {
+ /* This is the last instruction, switch to after block */
+ cursor.option = nir_cursor_after_block;
+ cursor.block = cursor.instr->block;
+ }
+ return cursor;
+
+ default:
+ unreachable("Inavlid cursor option");
+ }
+}
+
+bool
+nir_cursors_equal(nir_cursor a, nir_cursor b)
+{
+ /* Reduced cursors should be unique */
+ a = reduce_cursor(a);
+ b = reduce_cursor(b);
+
+ return a.block == b.block && a.option == b.option;
+}
+
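+/* Usage sketch (illustrative, assuming instr has a preceding instruction in
+ * its block): a cursor before an instruction denotes the same point as one
+ * after its predecessor, so the two reduce to the same form:
+ *
+ *    assert(nir_cursors_equal(nir_before_instr(instr),
+ *                             nir_after_instr(nir_instr_prev(instr))));
+ */
+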
static bool
add_use_cb(nir_src *src, void *state)
{
nir_var_local,
nir_var_uniform,
nir_var_shader_storage,
+ nir_var_shared,
nir_var_system_value,
nir_var_param,
} nir_variable_mode;
*
* \sa nir_variable_mode
*/
- nir_variable_mode mode:4;
+ nir_variable_mode mode:5;
/**
* Interpolation mode for shader inputs / outputs
*/
int index;
+ /**
+    * Vulkan descriptor set that this sampler or UBO binding belongs to.
+ */
+ int descriptor_set;
+
/**
* Initial binding point for a sampler or UBO.
*
#define nir_foreach_variable(var, var_list) \
foreach_list_typed(nir_variable, var, node, var_list)
+#define nir_foreach_variable_safe(var, var_list) \
+ foreach_list_typed_safe(nir_variable, var, node, var_list)
+
static inline bool
nir_variable_is_global(const nir_variable *var)
{
return var->data.mode != nir_var_local && var->data.mode != nir_var_param;
}
+/**
+ * Returns the bits in the inputs_read, outputs_written, or
+ * system_values_read bitfield corresponding to this variable.
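+ *
+ * For example (illustrative): a variable whose type occupies two slots at
+ * location VARYING_SLOT_VAR0 yields ((1ull << 2) - 1) << VARYING_SLOT_VAR0,
+ * i.e. two consecutive bits starting at its location.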
+ */
+static inline uint64_t
+nir_variable_get_io_mask(nir_variable *var, gl_shader_stage stage)
+{
+ assert(var->data.mode == nir_var_shader_in ||
+ var->data.mode == nir_var_shader_out ||
+ var->data.mode == nir_var_system_value);
+ assert(var->data.location >= 0);
+
+ const struct glsl_type *var_type = var->type;
+ if (stage == MESA_SHADER_GEOMETRY && var->data.mode == nir_var_shader_in) {
+ /* Most geometry shader inputs are per-vertex arrays */
+ if (var->data.location >= VARYING_SLOT_VAR0)
+ assert(glsl_type_is_array(var_type));
+
+ if (glsl_type_is_array(var_type))
+ var_type = glsl_get_array_element(var_type);
+ }
+
+ bool is_vertex_input = (var->data.mode == nir_var_shader_in &&
+ stage == MESA_SHADER_VERTEX);
+ unsigned slots = glsl_count_attribute_slots(var_type, is_vertex_input);
+ return ((1ull << slots) - 1) << var->data.location;
+}
+
typedef struct nir_register {
struct exec_node node;
*/
NIR_INTRINSIC_UCP_ID = 4,
+ /**
+ * The range of a load operation. This specifies the maximum amount of
+ * data starting at the base offset (if any) that can be accessed.
+ */
+ NIR_INTRINSIC_RANGE = 5,
+
+ /**
+    * The Vulkan descriptor set for the vulkan_resource_index intrinsic.
+ */
+ NIR_INTRINSIC_DESC_SET = 6,
+
+ /**
+    * The Vulkan descriptor set binding for the vulkan_resource_index intrinsic.
+ */
+ NIR_INTRINSIC_BINDING = 7,
+
NIR_INTRINSIC_NUM_INDEX_FLAGS,
} nir_intrinsic_index_flag;
INTRINSIC_IDX_ACCESSORS(base, BASE, int)
INTRINSIC_IDX_ACCESSORS(stream_id, STREAM_ID, unsigned)
INTRINSIC_IDX_ACCESSORS(ucp_id, UCP_ID, unsigned)
+INTRINSIC_IDX_ACCESSORS(range, RANGE, unsigned)
+INTRINSIC_IDX_ACCESSORS(desc_set, DESC_SET, unsigned)
+INTRINSIC_IDX_ACCESSORS(binding, BINDING, unsigned)
/**
* \group texture information
* are simulated by floats.)
*/
bool native_integers;
+
+   /* Indicates that the driver only supports zero-based vertex IDs */
+ bool vertex_id_zero_based;
} nir_shader_compiler_options;
typedef struct nir_shader_info {
/** list of outputs (nir_variable) */
struct exec_list outputs;
+ /** list of shared compute variables (nir_variable) */
+ struct exec_list shared;
+
/** Set of driver-specific options for the shader.
*
* The memory for the options is expected to be kept in a single static
* the highest index a load_input_*, load_uniform_*, etc. intrinsic can
* access plus one
*/
- unsigned num_inputs, num_uniforms, num_outputs;
+ unsigned num_inputs, num_uniforms, num_outputs, num_shared;
/** The shader stage, such as MESA_SHADER_VERTEX. */
gl_shader_stage stage;
} nir_shader;
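+/* Returns the shader's single function. This assumes earlier passes (e.g.
+ * function inlining) have left exactly one function, as the assert below
+ * enforces.
+ */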
+static inline nir_function *
+nir_shader_get_entrypoint(nir_shader *shader)
+{
+ assert(exec_list_length(&shader->functions) == 1);
+ struct exec_node *func_node = exec_list_get_head(&shader->functions);
+ nir_function *func = exec_node_data(nir_function, func_node, node);
+ return func;
+}
+
#define nir_foreach_function(shader, func) \
foreach_list_typed(nir_function, func, node, &(shader)->functions)
};
} nir_cursor;
+static inline nir_block *
+nir_cursor_current_block(nir_cursor cursor)
+{
+ if (cursor.option == nir_cursor_before_instr ||
+ cursor.option == nir_cursor_after_instr) {
+ return cursor.instr->block;
+ } else {
+ return cursor.block;
+ }
+}
+
+bool nir_cursors_equal(nir_cursor a, nir_cursor b);
+
static inline nir_cursor
nir_before_block(nir_block *block)
{
return nir_before_block(nir_cf_node_as_block(nir_cf_node_next(node)));
}
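+/* Like nir_after_cf_node(), but skips past any phi nodes at the start of the
+ * following block so that non-phi instructions can be inserted there.
+ */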
+static inline nir_cursor
+nir_after_cf_node_and_phis(nir_cf_node *node)
+{
+ if (node->type == nir_cf_node_block)
+ return nir_after_block(nir_cf_node_as_block(node));
+
+ nir_block *block = nir_cf_node_as_block(nir_cf_node_next(node));
+ assert(block->cf_node.type == nir_cf_node_block);
+
+ nir_foreach_instr(block, instr) {
+ if (instr->type != nir_instr_type_phi)
+ return nir_before_instr(instr);
+ }
+ return nir_after_block(block);
+}
+
static inline nir_cursor
nir_before_cf_list(struct exec_list *cf_list)
{
void nir_print_shader(nir_shader *shader, FILE *fp);
void nir_print_instr(const nir_instr *instr, FILE *fp);
nir_shader *nir_shader_clone(void *mem_ctx, const nir_shader *s);
nir_function_impl *nir_function_impl_clone(const nir_function_impl *impl);
+nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var);
#ifdef DEBUG
void nir_validate_shader(nir_shader *shader);
bool nir_split_var_copies(nir_shader *shader);
+bool nir_lower_returns_impl(nir_function_impl *impl);
+bool nir_lower_returns(nir_shader *shader);
+
+bool nir_inline_functions(nir_shader *shader);
+
void nir_lower_var_copy_instr(nir_intrinsic_instr *copy, void *mem_ctx);
void nir_lower_var_copies(nir_shader *shader);
bool nir_lower_locals_to_regs(nir_shader *shader);
-void nir_lower_outputs_to_temporaries(nir_shader *shader);
+void nir_lower_outputs_to_temporaries(nir_shader *shader,
+ nir_function *entrypoint);
+
+void nir_shader_gather_info(nir_shader *shader, nir_function_impl *entrypoint);
void nir_assign_var_locations(struct exec_list *var_list,
unsigned *size,
void nir_lower_vars_to_ssa(nir_shader *shader);
-bool nir_remove_dead_variables(nir_shader *shader);
+bool nir_remove_dead_variables(nir_shader *shader, nir_variable_mode mode);
void nir_move_vec_src_uses_to_dest(nir_shader *shader);
bool nir_lower_vec_to_movs(nir_shader *shader);
void nir_convert_to_ssa_impl(nir_function_impl *impl);
void nir_convert_to_ssa(nir_shader *shader);
+
+bool nir_repair_ssa_impl(nir_function_impl *impl);
+bool nir_repair_ssa(nir_shader *shader);
/* If phi_webs_only is true, only convert SSA values involved in phi nodes to
* registers. If false, convert all values (even those not involved in a phi
#! /usr/bin/env python
+# -*- encoding: utf-8 -*-
#
# Copyright (C) 2014 Intel Corporation
#
(('imul', a, 1), a),
(('fmul', a, -1.0), ('fneg', a)),
(('imul', a, -1), ('ineg', a)),
+ (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
(('ffma', 0.0, a, b), b),
(('ffma', a, 0.0, b), b),
(('ffma', a, b, 0.0), ('fmul', a, b)),
(('ishr', a, 0), a),
(('ushr', 0, a), 0),
(('ushr', a, 0), a),
+ (('iand', 0xff, ('ushr', a, 24)), ('ushr', a, 24)),
+ (('iand', 0xffff, ('ushr', a, 16)), ('ushr', a, 16)),
# Exponential/logarithmic identities
(('fexp2', ('flog2', a)), a), # 2^lg2(a) = a
(('flog2', ('fexp2', a)), a), # lg2(2^a) = a
(('f2i', ('ftrunc', a)), ('f2i', a)),
(('f2u', ('ftrunc', a)), ('f2u', a)),
+ # Byte extraction
+ (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
+ (('iand', 0xff, ('ushr', a, 16)), ('extract_u8', a, 2), '!options->lower_extract_byte'),
+ (('iand', 0xff, ('ushr', a, 8)), ('extract_u8', a, 1), '!options->lower_extract_byte'),
+ (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
+
+ # Word extraction
+ (('ushr', a, 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
+ (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
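+   # For example (illustrative): with a = 0xAABBCCDD, ('ushr', a, 24)
+   # selects byte 3 (0xAA) and ('iand', 0xffff, a) selects word 0 (0xCCDD),
+   # so each becomes a single extract operation when the backend supports
+   # one.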
+
# Subtracts
(('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
(('isub', a, ('isub', 0, b)), ('iadd', a, b)),
# Misc. lowering
(('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
+ (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
(('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
(('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
+ (('ldexp', 'x', 'exp'),
+ ('fmul', 'x', ('ishl', ('imin', ('imax', ('iadd', 'exp', 0x7f), 0), 0xff), 23))),
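+   # The ldexp lowering above constructs 2**exp as a float bit pattern:
+   # bias the exponent by 127, clamp to [0, 255], and shift it into the
+   # IEEE-754 exponent field (bit 23). Clamped values collapse to zero or
+   # infinity, so out-of-range exponents flush rather than wrap.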
(('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
('bcsel', ('ilt', 31, 'bits'), 'insert',
const char *const patch = (var->data.patch) ? "patch " : "";
const char *const inv = (var->data.invariant) ? "invariant " : "";
const char *const mode[] = { "shader_in ", "shader_out ", "", "",
- "uniform ", "shader_storage", "system " };
+ "uniform ", "shader_storage ", "shared ",
+ "system "};
fprintf(fp, "%s%s%s%s%s%s ",
cent, samp, patch, inv, mode[var->data.mode],
fprintf(fp, "%s", get_var_name(var, state));
}
+ static void
+ print_arg(nir_variable *var, print_state *state)
+ {
+ FILE *fp = state->fp;
+ glsl_print_type(var->type, fp);
+ fprintf(fp, " %s", get_var_name(var, state));
+ }
+
static void
print_deref_var(nir_deref_var *deref, print_state *state)
{
[NIR_INTRINSIC_WRMASK] = "wrmask",
[NIR_INTRINSIC_STREAM_ID] = "stream-id",
[NIR_INTRINSIC_UCP_ID] = "ucp-id",
+ [NIR_INTRINSIC_RANGE] = "range",
+ [NIR_INTRINSIC_DESC_SET] = "desc-set",
+ [NIR_INTRINSIC_BINDING] = "binding",
};
for (unsigned idx = 1; idx < NIR_INTRINSIC_NUM_INDEX_FLAGS; idx++) {
if (!info->index_map[idx])
if (i != 0)
fprintf(fp, ", ");
- print_var(impl->params[i], state);
+ print_arg(impl->params[i], state);
}
if (impl->return_var != NULL) {
if (impl->num_params != 0)
fprintf(fp, ", ");
fprintf(fp, "returning ");
- print_var(impl->return_var, state);
+ print_arg(impl->return_var, state);
}
fprintf(fp, "{\n");
+ for (unsigned i = 0; i < impl->num_params; i++) {
+ fprintf(fp, "\t");
+ print_var_decl(impl->params[i], state);
+ }
+
+ if (impl->return_var) {
+ fprintf(fp, "\t");
+ print_var_decl(impl->return_var, state);
+ }
+
nir_foreach_variable(var, &impl->locals) {
fprintf(fp, "\t");
print_var_decl(var, state);
fprintf(fp, "inputs: %u\n", shader->num_inputs);
fprintf(fp, "outputs: %u\n", shader->num_outputs);
fprintf(fp, "uniforms: %u\n", shader->num_uniforms);
+ fprintf(fp, "shared: %u\n", shader->num_shared);
nir_foreach_variable(var, &shader->uniforms) {
print_var_decl(var, &state);
print_var_decl(var, &state);
}
+ nir_foreach_variable(var, &shader->shared) {
+ print_var_decl(var, &state);
+ }
+
nir_foreach_variable(var, &shader->globals) {
print_var_decl(var, &state);
}
assert(impl->num_params == impl->function->num_params);
for (unsigned i = 0; i < impl->num_params; i++) {
assert(impl->params[i]->type == impl->function->params[i].type);
+ assert(impl->params[i]->data.mode == nir_var_param);
assert(impl->params[i]->data.location == i);
validate_var_decl(impl->params[i], false, state);
}
assert(impl->return_var == NULL);
} else {
assert(impl->return_var->type == impl->function->return_type);
+ assert(impl->return_var->data.mode == nir_var_param);
assert(impl->return_var->data.location == -1);
validate_var_decl(impl->return_var, false, state);
}
validate_var_decl(var, true, &state);
}
+ exec_list_validate(&shader->shared);
+ nir_foreach_variable(var, &shader->shared) {
+ validate_var_decl(var, true, &state);
+ }
+
exec_list_validate(&shader->globals);
nir_foreach_variable(var, &shader->globals) {
validate_var_decl(var, true, &state);
brw_shader.cpp \
brw_shader.h \
brw_surface_formats.c \
+ brw_surface_formats.h \
brw_util.c \
brw_util.h \
brw_vec4_builder.h \
brw_ff_gs.h \
brw_fs_channel_expressions.cpp \
brw_fs_vector_splitting.cpp \
+ brw_formatquery.c \
brw_gs.c \
brw_gs.h \
brw_gs_state.c \
.max_ds_threads = 80,
.max_gs_threads = 80,
.max_wm_threads = 128,
- .max_cs_threads = 28,
+ .max_cs_threads = 6 * 7,
.urb = {
.size = 192,
.min_vs_entries = 34,
return devinfo;
}
+
+const char *
+brw_get_device_name(int devid)
+{
+ switch (devid) {
+#undef CHIPSET
+#define CHIPSET(id, family, name) case id: return name;
+#include "pci_ids/i965_pci_ids.h"
+ default:
+ return NULL;
+ }
+}
#pragma once
#include <stdbool.h>
+ /**
+ * Intel hardware information and quirks
+ */
struct brw_device_info
{
int gen; /**< Generation number: 4, 5, 6, 7, ... */
bool has_resource_streamer;
/**
- * Quirks:
+ * \name Intel hardware quirks
* @{
*/
bool has_negative_rhw_bug;
/** @} */
/**
- * GPU Limits:
+ * \name GPU hardware limits
+ *
+ * In general, you can find shader thread maximums by looking at the "Maximum
+ * Number of Threads" field in the Intel PRM description of the 3DSTATE_VS,
+ * 3DSTATE_GS, 3DSTATE_HS, 3DSTATE_DS, and 3DSTATE_PS commands. URB entry
+ * limits come from the "Number of URB Entries" field in the
+ * 3DSTATE_URB_VS command and friends.
+ *
+ * These fields are used to calculate the scratch space to allocate. On
+ * modern GPUs a larger-than-necessary scratch allocation is harmless;
+ * prior to Haswell, however, programming a thread count greater than the
+ * hardware maximum would cause GPU performance to tank.
+ *
* @{
*/
/**
* Total number of slices present on the device whether or not they've been
* fused off.
+ *
+ * XXX: CS thread counts are limited by the inability to do cross subslice
+ * communication. It is effectively the number of logical threads which
+ * can be executed in a subslice. Fuse configurations may cause this number
+ * to change, so we program @max_cs_threads as the lower maximum.
*/
unsigned num_slices;
- unsigned max_vs_threads;
- unsigned max_hs_threads;
- unsigned max_ds_threads;
- unsigned max_gs_threads;
+ unsigned max_vs_threads; /**< Maximum Vertex Shader threads */
+ unsigned max_hs_threads; /**< Maximum Hull Shader threads */
+ unsigned max_ds_threads; /**< Maximum Domain Shader threads */
+ unsigned max_gs_threads; /**< Maximum Geometry Shader threads. */
+ /**
+ * Theoretical maximum number of Pixel Shader threads.
+ *
+ * PSD means Pixel Shader Dispatcher. On modern Intel GPUs, hardware will
+ * automatically scale pixel shader thread count, based on a single value
+ * programmed into 3DSTATE_PS.
+ *
+ * To calculate the maximum number of threads for Gen8 and beyond (which have
+ * multiple Pixel Shader Dispatchers):
+ *
+ * - Look up 3DSTATE_PS and find "Maximum Number of Threads Per PSD"
+ * - Usually there's only one PSD per subslice, so use the number of
+ * subslices for number of PSDs.
+ * - For max_wm_threads, the total should be PSD threads * #PSDs.
+ */
unsigned max_wm_threads;
+
+ /**
+ * Maximum Compute Shader threads.
+ *
+ * Thread count * number of EUs per subslice
+ */
unsigned max_cs_threads;
struct {
/**
- * Hardware default URB size. The units this is expressed in are
- * somewhat inconsistent: 512b units on Gen4-5, KB on Gen6-7, and KB
- * times the slice count on Gen8+.
+ * Hardware default URB size.
+ *
+ * The units this is expressed in are somewhat inconsistent: 512b units
+ * on Gen4-5, KB on Gen6-7, and KB times the slice count on Gen8+.
+ *
+ * Look up "URB Size" in the "Device Attributes" page, and take the
+ * maximum. Look up the slice count for each GT SKU on the same page.
+ * urb.size = URB Size (kbytes) / slice count
*/
unsigned size;
unsigned min_vs_entries;
};
const struct brw_device_info *brw_get_device_info(int devid);
+const char *brw_get_device_name(int devid);
* CSE can later notice that those loads are all the same and eliminate
* the redundant ones.
*/
- fs_reg vec4_offset = vgrf(glsl_type::int_type);
+ fs_reg vec4_offset = vgrf(glsl_type::uint_type);
bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
int scale = 1;
{
this->reg_offset = 0;
this->subreg_offset = 0;
- this->reladdr = NULL;
this->stride = 1;
if (this->file == IMM &&
(this->type != BRW_REGISTER_TYPE_V &&
{
return (this->backend_reg::equals(r) &&
subreg_offset == r.subreg_offset &&
- !reladdr && !r.reladdr &&
stride == r.stride);
}
assert(src[2].file == IMM);
unsigned region_length = src[2].ud;
- if (src[0].file == FIXED_GRF) {
+ if (src[0].file == UNIFORM) {
+ assert(region_length % 4 == 0);
+ return region_length / 4;
+ } else if (src[0].file == FIXED_GRF) {
/* If the start of the region is not register aligned, then
* there's some portion of the register that's technically
* unread at the beginning.
* unread portion at the beginning.
*/
if (src[0].subnr)
- region_length += src[0].subnr * type_sz(src[0].type);
+ region_length += src[0].subnr;
return DIV_ROUND_UP(region_length, REG_SIZE);
} else {
this->push_constant_loc = v->push_constant_loc;
this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
- this->param_size = v->param_size;
}
fs_reg *
* maximum number of fragment shader uniform components (64). If
* there are too many of these, they'd fill up all of register space.
* So, this will push some of them out to the pull constant buffer and
- * update the program to load them. We also use pull constants for all
- * indirect constant loads because we don't support indirect accesses in
- * registers yet.
+ * update the program to load them.
*/
void
fs_visitor::assign_constant_locations()
{
- /* Only the first compile (SIMD8 mode) gets to decide on locations. */
- if (dispatch_width != 8)
+ /* Only the first compile gets to decide on locations. */
+ if (dispatch_width != min_dispatch_width)
return;
- unsigned int num_pull_constants = 0;
-
- pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
- memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
-
bool is_live[uniforms];
memset(is_live, 0, sizeof(is_live));
+ /* For each uniform slot, a value of true indicates that the given slot and
+ * the next slot must remain contiguous. This is used to keep us from
+ * splitting arrays apart.
+ */
+ bool contiguous[uniforms];
+ memset(contiguous, 0, sizeof(contiguous));
+
/* First, we walk through the instructions and do two things:
*
* 1) Figure out which uniforms are live.
*
- * 2) Find all indirect access of uniform arrays and flag them as needing
- * to go into the pull constant buffer.
+ * 2) Mark any indirectly used ranges of registers as contiguous.
*
* Note that we don't move constant-indexed accesses to arrays. No
* testing has been done of the performance impact of this choice.
if (inst->src[i].file != UNIFORM)
continue;
- if (inst->src[i].reladdr) {
- int uniform = inst->src[i].nr;
+ int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
- /* If this array isn't already present in the pull constant buffer,
- * add it.
- */
- if (pull_constant_loc[uniform] == -1) {
- assert(param_size[uniform]);
- for (int j = 0; j < param_size[uniform]; j++)
- pull_constant_loc[uniform + j] = num_pull_constants++;
+ if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+ assert(inst->src[2].ud % 4 == 0);
+ unsigned last = constant_nr + (inst->src[2].ud / 4) - 1;
+ assert(last < uniforms);
+
+ for (unsigned j = constant_nr; j < last; j++) {
+ is_live[j] = true;
+ contiguous[j] = true;
}
+ is_live[last] = true;
} else {
- /* Mark the the one accessed uniform as live */
- int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
if (constant_nr >= 0 && constant_nr < (int) uniforms)
is_live[constant_nr] = true;
}
* If changing this value, note the limitation about total_regs in
* brw_curbe.c.
*/
- unsigned int max_push_components = 16 * 8;
+ const unsigned int max_push_components = 16 * 8;
+
+   /* Limit the maximum chunk size to 32 floats (128 bytes), which is the
+    * maximum Vulkan push constant size.
+    */
+ const unsigned int max_chunk_size = 32;
+
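+   /* Illustrative example: if uniforms 4..7 are read through a single
+    * MOV_INDIRECT, contiguous[4..6] are set above, so the loop below keeps
+    * all four slots together, assigning the whole chunk to either push or
+    * pull constants rather than splitting the array.
+    */
+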
unsigned int num_push_constants = 0;
+ unsigned int num_pull_constants = 0;
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+ pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
- for (unsigned int i = 0; i < uniforms; i++) {
- if (!is_live[i] || pull_constant_loc[i] != -1) {
- /* This UNIFORM register is either dead, or has already been demoted
- * to a pull const. Mark it as no longer living in the param[] array.
- */
- push_constant_loc[i] = -1;
+ int chunk_start = -1;
+ for (unsigned u = 0; u < uniforms; u++) {
+ push_constant_loc[u] = -1;
+ pull_constant_loc[u] = -1;
+
+ if (!is_live[u])
continue;
- }
- if (num_push_constants < max_push_components) {
- /* Retain as a push constant. Record the location in the params[]
- * array.
- */
- push_constant_loc[i] = num_push_constants++;
- } else {
- /* Demote to a pull constant. */
- push_constant_loc[i] = -1;
- pull_constant_loc[i] = num_pull_constants++;
+ /* This is the first live uniform in the chunk */
+ if (chunk_start < 0)
+ chunk_start = u;
+
+ /* If this element does not need to be contiguous with the next, we
+       * split at this point and everything between chunk_start and u forms a
+ * single chunk.
+ */
+ if (!contiguous[u]) {
+ unsigned chunk_size = u - chunk_start + 1;
+
+ if (num_push_constants + chunk_size <= max_push_components &&
+ chunk_size <= max_chunk_size) {
+ for (unsigned j = chunk_start; j <= u; j++)
+ push_constant_loc[j] = num_push_constants++;
+ } else {
+ for (unsigned j = chunk_start; j <= u; j++)
+ pull_constant_loc[j] = num_pull_constants++;
+ }
+
+ chunk_start = -1;
}
}
* or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
*/
void
-fs_visitor::demote_pull_constants()
+fs_visitor::lower_constant_loads()
{
- foreach_block_and_inst (block, fs_inst, inst, cfg) {
+ const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+
+ foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+ /* Set up the annotation tracking for new generated instructions. */
+ const fs_builder ibld(this, block, inst);
+
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file != UNIFORM)
continue;
- int pull_index;
+ /* We'll handle this case later */
+ if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
+ continue;
+
unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
- if (location >= uniforms) /* Out of bounds access */
- pull_index = -1;
- else
- pull_index = pull_constant_loc[location];
+ if (location >= uniforms)
+ continue; /* Out of bounds access */
+
+ int pull_index = pull_constant_loc[location];
if (pull_index == -1)
continue;
- /* Set up the annotation tracking for new generated instructions. */
- const fs_builder ibld(this, block, inst);
- const unsigned index = stage_prog_data->binding_table.pull_constants_start;
- fs_reg dst = vgrf(glsl_type::float_type);
-
assert(inst->src[i].stride == 0);
- /* Generate a pull load into dst. */
- if (inst->src[i].reladdr) {
- VARYING_PULL_CONSTANT_LOAD(ibld, dst,
- brw_imm_ud(index),
- *inst->src[i].reladdr,
- pull_index * 4);
- inst->src[i].reladdr = NULL;
- inst->src[i].stride = 1;
- } else {
- const fs_builder ubld = ibld.exec_all().group(8, 0);
- struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
- ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
- dst, brw_imm_ud(index), offset);
- inst->src[i].set_smear(pull_index & 3);
- }
- brw_mark_surface_used(prog_data, index);
+ fs_reg dst = vgrf(glsl_type::float_type);
+ const fs_builder ubld = ibld.exec_all().group(8, 0);
+ struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+ ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+ dst, brw_imm_ud(index), offset);
/* Rewrite the instruction to use the temporary VGRF. */
inst->src[i].file = VGRF;
inst->src[i].nr = dst.nr;
inst->src[i].reg_offset = 0;
+ inst->src[i].set_smear(pull_index & 3);
+
+ brw_mark_surface_used(prog_data, index);
+ }
+
+ if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
+ inst->src[0].file == UNIFORM) {
+
+ unsigned location = inst->src[0].nr + inst->src[0].reg_offset;
+ if (location >= uniforms)
+ continue; /* Out of bounds access */
+
+ int pull_index = pull_constant_loc[location];
+
+ if (pull_index == -1)
+ continue;
+
+ VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
+ brw_imm_ud(index),
+ inst->src[1],
+ pull_index * 4);
+ inst->remove(block);
+
+ brw_mark_surface_used(prog_data, index);
}
}
invalidate_live_intervals();
* we have enough space, but it will make sure the dead code eliminator kills
* the instruction that this will replace.
*/
- if (tex_inst->header_size != 0)
+ if (tex_inst->header_size != 0) {
+ invalidate_live_intervals();
return true;
+ }
fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
load_payload->sources + 1);
tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
tex_inst->src[0] = send_header;
+ invalidate_live_intervals();
return true;
}
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
int base_mrf = 1;
int color_mrf = base_mrf + 2;
+ fs_inst *mov;
- fs_inst *mov = bld.exec_all().group(4, 0)
- .MOV(brw_message_reg(color_mrf),
- fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+ if (uniforms == 1) {
+ mov = bld.exec_all().group(4, 0)
+ .MOV(brw_message_reg(color_mrf),
+ fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+ } else {
+ struct brw_reg reg =
+ brw_reg(BRW_GENERAL_REGISTER_FILE,
+ 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_8,
+ BRW_WIDTH_2,
+ BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+ mov = bld.exec_all().group(4, 0)
+ .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
+ }
fs_inst *write;
if (key->nr_color_regions == 1) {
assign_curb_setup();
/* Now that we have the uniform assigned, go ahead and force it to a vec4. */
- assert(mov->src[0].file == FIXED_GRF);
- mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+ if (uniforms == 1) {
+ assert(mov->src[0].file == FIXED_GRF);
+ mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+ }
}
/**
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
return 8;
+ case SHADER_OPCODE_MOV_INDIRECT:
+ /* Prior to Broadwell, we only have 8 address subregisters */
+ return devinfo->gen < 8 ? 8 : inst->exec_size;
+
default:
return inst->exec_size;
}
break;
case UNIFORM:
fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
- if (inst->src[i].reladdr) {
- fprintf(file, "+reladdr");
- } else if (inst->src[i].subreg_offset) {
+ if (inst->src[i].subreg_offset) {
fprintf(file, "+%d.%d", inst->src[i].reg_offset,
inst->src[i].subreg_offset);
}
{
if (end == start ||
end->is_partial_write() ||
- reg.reladdr ||
!reg.equals(end->dst)) {
return NULL;
} else {
bld = fs_builder(this, 64);
assign_constant_locations();
- demote_pull_constants();
+ lower_constant_loads();
validate();
void
fs_visitor::fixup_3src_null_dest()
{
+ bool progress = false;
+
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
if (inst->is_3src() && inst->dst.is_null()) {
inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
inst->dst.type);
+ progress = true;
}
}
+
+ if (progress)
+ invalidate_live_intervals();
}
void
* SIMD8. There's probably actually some intermediate point where
* SIMD16 with a couple of spills is still better.
*/
- if (dispatch_width == 16) {
+ if (dispatch_width == 16 && min_dispatch_width <= 8) {
fail("Failure to register allocate. Reduce number of "
"live scalar values to avoid this.");
} else {
if (shader_time_index >= 0)
emit_shader_time_begin();
+ if (devinfo->is_haswell && prog_data->total_shared > 0) {
+ /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
+ const fs_builder abld = bld.exec_all().group(1, 0);
+ abld.MOV(retype(suboffset(brw_sr0_reg(), 1), BRW_REGISTER_TYPE_UW),
+ suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
+ }
+
emit_nir_code();
if (failed)
shader->info.cs.local_size[2];
unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
+ unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
cfg_t *cfg = NULL;
const char *fail_msg = NULL;
fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
NULL, /* Never used in core profile */
shader, 8, shader_time_index);
- if (!v8.run_cs()) {
- fail_msg = v8.fail_msg;
- } else if (local_workgroup_size <= 8 * max_cs_threads) {
- cfg = v8.cfg;
- prog_data->simd_size = 8;
+ if (simd_required <= 8) {
+ if (!v8.run_cs()) {
+ fail_msg = v8.fail_msg;
+ } else {
+ cfg = v8.cfg;
+ prog_data->simd_size = 8;
+ }
}
fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
!fail_msg && !v8.simd16_unsupported &&
local_workgroup_size <= 16 * max_cs_threads) {
/* Try a SIMD16 compile */
- v16.import_uniforms(&v8);
+ if (simd_required <= 8)
+ v16.import_uniforms(&v8);
if (!v16.run_cs()) {
compiler->shader_perf_log(log_data,
"SIMD16 shader failed to compile: %s",
void split_virtual_grfs();
bool compact_virtual_grfs();
void assign_constant_locations();
- void demote_pull_constants();
+ void lower_constant_loads();
void invalidate_live_intervals();
void calculate_live_intervals();
void calculate_register_pressure();
void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
uint32_t spill_offset, int count);
void emit_spill(bblock_t *block, fs_inst *inst, fs_reg reg,
- uint32_t spill_offset, int count);
+ uint32_t spill_offset, int count, bool we_all);
void emit_nir_code();
void nir_setup_inputs();
void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
unsigned wr_mask);
+ bool optimize_extract_to_float(nir_alu_instr *instr,
+ const fs_reg &result);
bool optimize_frontfacing_ternary(nir_alu_instr *instr,
const fs_reg &result);
const struct brw_vue_map *input_vue_map;
- int *param_size;
-
int *virtual_grf_start;
int *virtual_grf_end;
brw::fs_live_variables *live_intervals;
bool spilled_any_registers;
const unsigned dispatch_width; /**< 8 or 16 */
+ unsigned min_dispatch_width;
int shader_time_index;
return;
uniforms = nir->num_uniforms / 4;
-
- nir_foreach_variable(var, &nir->uniforms) {
- /* UBO's and atomics don't take up space in the uniform file */
- if (var->interface_type != NULL || var->type->contains_atomic())
- continue;
-
- if (type_size_scalar(var->type) > 0)
- param_size[var->data.driver_location / 4] = type_size_scalar(var->type);
- }
}
static bool
}
}
+ /**
+ * Recognizes when the source of an i2f/u2f conversion is a nir_op_extract_*
+ * instruction and replaces the pair with a single extract instruction whose
+ * destination type matches instr.
+ */
+ bool
+ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
+ const fs_reg &result)
+ {
+ if (!instr->src[0].src.is_ssa ||
+ !instr->src[0].src.ssa->parent_instr)
+ return false;
+
+ if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
+ return false;
+
+ nir_alu_instr *src0 =
+ nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+
+ if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
+ src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
+ return false;
+
+ nir_const_value *element = nir_src_as_const_value(src0->src[1].src);
+ assert(element != NULL);
+
+ enum opcode extract_op;
+ if (src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16) {
+ assert(element->u[0] <= 1);
+ extract_op = SHADER_OPCODE_EXTRACT_WORD;
+ } else {
+ assert(element->u[0] <= 3);
+ extract_op = SHADER_OPCODE_EXTRACT_BYTE;
+ }
+
+ fs_reg op0 = get_nir_src(src0->src[0].src);
+ op0.type = brw_type_for_nir_type(nir_op_infos[src0->op].input_types[0]);
+ op0 = offset(op0, bld, src0->src[0].swizzle[0]);
+
+ set_saturate(instr->dest.saturate,
+ bld.emit(extract_op, result, op0, brw_imm_ud(element->u[0])));
+ return true;
+ }
+
bool
fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
const fs_reg &result)
switch (instr->op) {
case nir_op_i2f:
case nir_op_u2f:
+ if (optimize_extract_to_float(instr, result))
+ return;
+
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
inst->saturate = instr->dest.saturate;
break;
- case nir_op_fsin:
- inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
- inst->saturate = instr->dest.saturate;
+ case nir_op_fsin: {
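+      /* The hardware SIN (and COS, below) results can drift slightly out of
+       * the expected range; when the result is not saturated, multiply by a
+       * small empirical fudge factor to pull it back in (an assumption based
+       * on the 0.99997 scale used here).
+       */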
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
+ inst = bld.emit(SHADER_OPCODE_SIN, tmp, op[0]);
+ if (instr->dest.saturate) {
+ inst->dst = result;
+ inst->saturate = true;
+ } else {
+ bld.MUL(result, tmp, brw_imm_f(0.99997));
+ }
break;
+ }
- case nir_op_fcos:
- inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
- inst->saturate = instr->dest.saturate;
+ case nir_op_fcos: {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
+ inst = bld.emit(SHADER_OPCODE_COS, tmp, op[0]);
+ if (instr->dest.saturate) {
+ inst->dst = result;
+ inst->saturate = true;
+ } else {
+ bld.MUL(result, tmp, brw_imm_f(0.99997));
+ }
break;
+ }
case nir_op_fddx:
if (fs_key->high_quality_derivatives) {
unreachable("Should have been lowered by borrow_to_arith().");
case nir_op_umod:
+ case nir_op_irem:
+ /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
+ * appears that our hardware just does the right thing for signed
+ * remainder.
+ */
bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
break;
+ case nir_op_imod: {
+      /* Get a regular C-style remainder. The MOV below sets the predicate
+       * if a % b is nonzero.
+       */
+ bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
+
+ /* Math instructions don't support conditional mod */
+ inst = bld.MOV(bld.null_reg_d(), result);
+ inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+      /* Now, we need to determine if the signs of the sources are different.
+ * When we XOR the sources, the top bit is 0 if they are the same and 1
+ * if they are different. We can then use a conditional modifier to
+ * turn that into a predicate. This leads us to an XOR.l instruction.
+ */
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
+ inst = bld.XOR(tmp, op[0], op[1]);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->conditional_mod = BRW_CONDITIONAL_L;
+
+ /* If the result of the initial remainder operation is non-zero and the
+ * two sources have different signs, add in a copy of op[1] to get the
+ * final integer modulus value.
+ */
+ inst = bld.ADD(result, result, op[1]);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ break;
+ }
+
case nir_op_flt:
case nir_op_ilt:
case nir_op_ult:
inst->saturate = instr->dest.saturate;
break;
+ case nir_op_fquantize2f16: {
+ fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
+ fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
+ fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+ /* The destination stride must be at least as big as the source stride. */
+ tmp16.type = BRW_REGISTER_TYPE_W;
+ tmp16.stride = 2;
+
+ /* Check for denormal */
+ fs_reg abs_src0 = op[0];
+ abs_src0.abs = true;
+ bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
+ BRW_CONDITIONAL_L);
+ /* Get the appropriately signed zero */
+ bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
+ retype(op[0], BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0x80000000));
+ /* Do the actual F32 -> F16 -> F32 conversion */
+ bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
+ bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
+ /* Select that or zero based on normal status */
+ inst = bld.SEL(result, zero, tmp32);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+
case nir_op_fmin:
case nir_op_imin:
case nir_op_umin:
{
fs_reg image(UNIFORM, deref->var->data.driver_location / 4,
BRW_REGISTER_TYPE_UD);
+ fs_reg indirect;
+ unsigned indirect_max = 0;
for (const nir_deref *tail = &deref->deref; tail->child;
tail = tail->child) {
image = offset(image, bld, base * element_size);
if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
- fs_reg tmp = vgrf(glsl_type::int_type);
+ fs_reg tmp = vgrf(glsl_type::uint_type);
if (devinfo->gen == 7 && !devinfo->is_haswell) {
/* IVB hangs when trying to access an invalid surface index with
bld.MOV(tmp, get_nir_src(deref_array->indirect));
}
+ indirect_max += element_size * (tail->type->length - 1);
+
bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4));
- if (image.reladdr)
- bld.ADD(*image.reladdr, *image.reladdr, tmp);
- else
- image.reladdr = new(mem_ctx) fs_reg(tmp);
+ if (indirect.file == BAD_FILE) {
+ indirect = tmp;
+ } else {
+ bld.ADD(indirect, indirect, tmp);
+ }
}
}
- return image;
+ if (indirect.file == BAD_FILE) {
+ return image;
+ } else {
+ /* Emit a pile of MOVs to load the uniform into a temporary. The
+ * dead-code elimination pass will get rid of what we don't use.
+ */
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE);
+ for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) {
+ bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+ offset(tmp, bld, j), offset(image, bld, j),
+ indirect, brw_imm_ud((indirect_max + 1) * 4));
+ }
+ return tmp;
+ }
}
void
nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
break;
+ case nir_intrinsic_load_shared: {
+ assert(devinfo->gen >= 7);
+
+ fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
+
+ /* Get the offset to read from */
+ fs_reg offset_reg;
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+ if (const_offset) {
+ offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0]);
+ } else {
+ offset_reg = vgrf(glsl_type::uint_type);
+ bld.ADD(offset_reg,
+ retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(instr->const_index[0]));
+ }
+
+ /* Read the vector */
+ fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
+ 1 /* dims */,
+ instr->num_components,
+ BRW_PREDICATE_NONE);
+ read_result.type = dest.type;
+ for (int i = 0; i < instr->num_components; i++)
+ bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
+
+ break;
+ }
+
+ case nir_intrinsic_store_shared: {
+ assert(devinfo->gen >= 7);
+
+ /* Block index */
+ fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
+
+ /* Value */
+ fs_reg val_reg = get_nir_src(instr->src[0]);
+
+ /* Writemask */
+ unsigned writemask = instr->const_index[1];
+
+ /* Combine groups of consecutive enabled channels in one write
+ * message. We use ffs to find the first enabled channel and then ffs on
+ * the bit-inverse, down-shifted writemask to determine the length of
+ * the block of enabled bits.
+ */
+ while (writemask) {
+ unsigned first_component = ffs(writemask) - 1;
+ unsigned length = ffs(~(writemask >> first_component)) - 1;
+ fs_reg offset_reg;
+
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+ if (const_offset) {
+ offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0] +
+ 4 * first_component);
+ } else {
+ offset_reg = vgrf(glsl_type::uint_type);
+ bld.ADD(offset_reg,
+ retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(instr->const_index[0] + 4 * first_component));
+ }
+
+ emit_untyped_write(bld, surf_index, offset_reg,
+ offset(val_reg, bld, first_component),
+ 1 /* dims */, length,
+ BRW_PREDICATE_NONE);
+
+ /* Clear the bits in the writemask that we just wrote, then try
+ * again to see if more channels are left.
+ */
+ writemask &= (15 << (first_component + length));
+ }
+
+ break;
+ }
+
default:
nir_emit_intrinsic(bld, instr);
break;
case nir_intrinsic_atomic_counter_inc:
case nir_intrinsic_atomic_counter_dec:
case nir_intrinsic_atomic_counter_read: {
- using namespace surface_access;
-
/* Get the arguments of the atomic intrinsic. */
const fs_reg offset = get_nir_src(instr->src[0]);
const unsigned surface = (stage_prog_data->binding_table.abo_start +
/* Offsets are in bytes but they should always be multiples of 4 */
assert(const_offset->u[0] % 4 == 0);
src.reg_offset = const_offset->u[0] / 4;
+
+ for (unsigned j = 0; j < instr->num_components; j++) {
+ bld.MOV(offset(dest, bld, j), offset(src, bld, j));
+ }
} else {
- src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
- }
+ fs_reg indirect = retype(get_nir_src(instr->src[0]),
+ BRW_REGISTER_TYPE_UD);
- for (unsigned j = 0; j < instr->num_components; j++) {
- bld.MOV(offset(dest, bld, j), offset(src, bld, j));
+ /* We need to pass a size to the MOV_INDIRECT but we don't want it to
+ * go past the end of the uniform. In order to keep the n'th
+ * component from running past, we subtract off the size of all but
+ * one component of the vector.
+ */
+ assert(instr->const_index[1] >= instr->num_components * 4);
+ unsigned read_size = instr->const_index[1] -
+ (instr->num_components - 1) * 4;
+
+ for (unsigned j = 0; j < instr->num_components; j++) {
+ bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+ offset(dest, bld, j), offset(src, bld, j),
+ indirect, brw_imm_ud(read_size));
+ }
}
break;
}
break;
}
- case nir_intrinsic_load_shared: {
- assert(devinfo->gen >= 7);
-
- fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
-
- /* Get the offset to read from */
- fs_reg offset_reg;
- nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
- if (const_offset) {
- offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0]);
- } else {
- offset_reg = vgrf(glsl_type::uint_type);
- bld.ADD(offset_reg,
- retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
- brw_imm_ud(instr->const_index[0]));
- }
-
- /* Read the vector */
- fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
- 1 /* dims */,
- instr->num_components,
- BRW_PREDICATE_NONE);
- read_result.type = dest.type;
- for (int i = 0; i < instr->num_components; i++)
- bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
-
- break;
- }
-
- case nir_intrinsic_store_shared: {
- assert(devinfo->gen >= 7);
-
- /* Block index */
- fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
-
- /* Value */
- fs_reg val_reg = get_nir_src(instr->src[0]);
-
- /* Writemask */
- unsigned writemask = instr->const_index[1];
-
- /* Combine groups of consecutive enabled channels in one write
- * message. We use ffs to find the first enabled channel and then ffs on
- * the bit-inverse, down-shifted writemask to determine the length of
- * the block of enabled bits.
- */
- while (writemask) {
- unsigned first_component = ffs(writemask) - 1;
- unsigned length = ffs(~(writemask >> first_component)) - 1;
- fs_reg offset_reg;
-
- nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
- if (const_offset) {
- offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0] +
- 4 * first_component);
- } else {
- offset_reg = vgrf(glsl_type::uint_type);
- bld.ADD(offset_reg,
- retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
- brw_imm_ud(instr->const_index[0] + 4 * first_component));
- }
-
- emit_untyped_write(bld, surf_index, offset_reg,
- offset(val_reg, bld, first_component),
- 1 /* dims */, length,
- BRW_PREDICATE_NONE);
-
- /* Clear the bits in the writemask that we just wrote, then try
- * again to see if more channels are left.
- */
- writemask &= (15 << (first_component + length));
- }
-
- break;
- }
-
case nir_intrinsic_load_input: {
fs_reg src;
if (stage == MESA_SHADER_VERTEX) {
/* Emit the actual atomic operation */
- fs_reg atomic_result =
- surface_access::emit_untyped_atomic(bld, surface, offset,
- data1, data2,
- 1 /* dims */, 1 /* rsize */,
- op,
- BRW_PREDICATE_NONE);
+ fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
+ data1, data2,
+ 1 /* dims */, 1 /* rsize */,
+ op,
+ BRW_PREDICATE_NONE);
dest.type = atomic_result.type;
bld.MOV(dest, atomic_result);
}
/* Emit the actual atomic operation */
- fs_reg atomic_result =
- surface_access::emit_untyped_atomic(bld, surface, offset,
- data1, data2,
- 1 /* dims */, 1 /* rsize */,
- op,
- BRW_PREDICATE_NONE);
+ fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
+ data1, data2,
+ 1 /* dims */, 1 /* rsize */,
+ op,
+ BRW_PREDICATE_NONE);
dest.type = atomic_result.type;
bld.MOV(dest, atomic_result);
}
fs_reg coordinate, shadow_comparitor, lod, lod2, sample_index, mcs, tex_offset;
+ /* Our hardware requires a LOD for buffer textures */
+ if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+ lod = brw_imm_d(0);
+
for (unsigned i = 0; i < instr->num_srcs; i++) {
fs_reg src = get_nir_src(instr->src[i].src);
switch (instr->src[i].src_type) {
unreachable("unhandled shader stage");
}
+ if (stage == MESA_SHADER_COMPUTE) {
+ const brw_cs_prog_data *cs_prog_data =
+ (const brw_cs_prog_data *) prog_data;
+ unsigned size = cs_prog_data->local_size[0] *
+ cs_prog_data->local_size[1] *
+ cs_prog_data->local_size[2];
+ size = DIV_ROUND_UP(size, devinfo->max_cs_threads);
+ min_dispatch_width = size > 16 ? 32 : (size > 8 ? 16 : 8);
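+      /* e.g. (illustrative): a 1024-invocation workgroup on hardware with
+       * 64 CS threads needs ceil(1024/64) = 16 invocations per thread, so
+       * at least SIMD16 is required; 256 invocations would allow SIMD8.
+       */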
+ } else {
+ min_dispatch_width = 8;
+ }
+
this->prog_data = this->stage_prog_data;
this->failed = false;
this->spilled_any_registers = false;
this->do_dual_src = false;
-
- if (dispatch_width == 8)
- this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
}
fs_visitor::~fs_visitor()
!reladdr && !r.reladdr);
}
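+ /* Helper for opt_vector_float() below: replaces a run of scalar MOVs of
+  * VF-representable float immediates to the same register with a single
+  * vectorized MOV of a packed VF immediate covering their combined
+  * writemask.
+  */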
+ bool
+ vec4_visitor::vectorize_mov(bblock_t *block, vec4_instruction *inst,
+ uint8_t imm[4], vec4_instruction *imm_inst[4],
+ int inst_count, unsigned writemask)
+ {
+ if (inst_count < 2)
+ return false;
+
+ unsigned vf;
+ memcpy(&vf, imm, sizeof(vf));
+ vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf));
+ mov->dst.type = BRW_REGISTER_TYPE_F;
+ mov->dst.writemask = writemask;
+ inst->insert_before(block, mov);
+
+ for (int i = 0; i < inst_count; i++) {
+ imm_inst[i]->remove(block);
+ }
+
+ return true;
+ }
+
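vectorize_mov() relies on the VF immediate encoding; a small sketch of the packing it performs (hypothetical helper, and it assumes a little-endian host, as the memcpy above does):

   #include <cstdint>
   #include <cstring>

   /* Each byte of imm[] is an 8-bit restricted float produced by
    * brw_float_to_vf(); copying the four bytes into one uint32_t yields
    * the packed VF immediate, channel X in the low byte.
    */
   static uint32_t pack_vf(const uint8_t imm[4])
   {
      uint32_t vf;
      memcpy(&vf, imm, sizeof(vf));
      return vf;
   }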
bool
vec4_visitor::opt_vector_float()
{
int last_reg = -1, last_reg_offset = -1;
enum brw_reg_file last_reg_file = BAD_FILE;
- int remaining_channels = 0;
- uint8_t imm[4];
+ uint8_t imm[4] = { 0 };
int inst_count = 0;
vec4_instruction *imm_inst[4];
+ unsigned writemask = 0;
foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
if (last_reg != inst->dst.nr ||
last_reg_offset != inst->dst.reg_offset ||
last_reg_file != inst->dst.file) {
+ progress |= vectorize_mov(block, inst, imm, imm_inst, inst_count,
+ writemask);
+ inst_count = 0;
+ writemask = 0;
last_reg = inst->dst.nr;
last_reg_offset = inst->dst.reg_offset;
last_reg_file = inst->dst.file;
- remaining_channels = WRITEMASK_XYZW;
- inst_count = 0;
+ for (int i = 0; i < 4; i++) {
+ imm[i] = 0;
+ }
}
if (inst->opcode != BRW_OPCODE_MOV ||
inst->dst.writemask == WRITEMASK_XYZW ||
- inst->src[0].file != IMM)
+ inst->src[0].file != IMM ||
+ inst->predicate != BRW_PREDICATE_NONE) {
+ progress |= vectorize_mov(block, inst, imm, imm_inst, inst_count,
+ writemask);
+ inst_count = 0;
+ last_reg = -1;
continue;
+ }
int vf = brw_float_to_vf(inst->src[0].f);
if (vf == -1)
if ((inst->dst.writemask & WRITEMASK_W) != 0)
imm[3] = vf;
+ writemask |= inst->dst.writemask;
imm_inst[inst_count++] = inst;
-
- remaining_channels &= ~inst->dst.writemask;
- if (remaining_channels == 0) {
- unsigned vf;
- memcpy(&vf, imm, sizeof(vf));
- vec4_instruction *mov = MOV(inst->dst, brw_imm_vf(vf));
- mov->dst.type = BRW_REGISTER_TYPE_F;
- mov->dst.writemask = WRITEMASK_XYZW;
- inst->insert_after(block, mov);
- last_reg = -1;
-
- for (int i = 0; i < inst_count; i++) {
- imm_inst[i]->remove(block);
- }
- progress = true;
- }
}
if (progress)
inst->src[i].reg_offset = 0;
}
}
-
- /* Update that everything is now vector-sized. */
- for (int i = 0; i < this->uniforms; i++) {
- this->uniform_size[i] = 1;
- }
}
void
* push constants.
*/
for (int src = 0; src < uniforms; src++) {
- assert(src < uniform_array_size);
int size = chans_used[src];
if (size == 0)
dst_reg temp = dst_reg(this, glsl_type::vec4_type);
emit_pull_constant_load(block, inst, temp, inst->src[i],
- pull_constant_loc[uniform]);
+ pull_constant_loc[uniform], src_reg());
inst->src[i].file = temp.file;
inst->src[i].nr = temp.nr;
if (is_nop_mov) {
inst->remove(block);
+ progress = true;
continue;
}
}
* matter what, or the GPU would hang.
*/
if (devinfo->gen < 6 && this->uniforms == 0) {
- assert(this->uniforms < this->uniform_array_size);
-
stage_prog_data->param =
reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
for (unsigned int i = 0; i < 4; i++) {
*/
dst_reg output_reg[BRW_VARYING_SLOT_COUNT];
const char *output_reg_annotation[BRW_VARYING_SLOT_COUNT];
- int *uniform_size;
- int uniform_array_size; /*< Size of the uniform_size array */
int uniforms;
src_reg shader_start_time;
void emit_shader_time_end();
void emit_shader_time_write(int shader_time_subindex, src_reg value);
- void emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
- dst_reg dst, src_reg offset, src_reg src0,
- src_reg src1);
-
- void emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
- src_reg offset);
-
src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst,
src_reg *reladdr, int reg_offset);
- src_reg get_pull_constant_offset(bblock_t *block, vec4_instruction *inst,
- src_reg *reladdr, int reg_offset);
void emit_scratch_read(bblock_t *block, vec4_instruction *inst,
dst_reg dst,
src_reg orig_src,
void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
dst_reg dst,
src_reg orig_src,
- int base_offset);
+ int base_offset,
+ src_reg indirect);
void emit_pull_constant_load_reg(dst_reg dst,
src_reg surf_index,
src_reg offset,
virtual void gs_end_primitive();
private:
+ bool vectorize_mov(bblock_t *block, vec4_instruction *inst,
+ uint8_t imm[4], vec4_instruction *imm_inst[4],
+ int inst_count, unsigned writemask);
+
/**
* If true, then register allocation should fail instead of spilling.
*/
vec4_visitor::nir_setup_uniforms()
{
uniforms = nir->num_uniforms / 16;
-
- nir_foreach_variable(var, &nir->uniforms) {
- /* UBO's and atomics don't take up space in the uniform file */
- if (var->interface_type != NULL || var->type->contains_atomic())
- continue;
-
- if (type_size_vec4(var->type) > 0)
- uniform_size[var->data.driver_location / 16] = type_size_vec4(var->type);
- }
}
void
/* Offsets are in bytes but they should always be multiples of 16 */
assert(const_offset->u[0] % 16 == 0);
src.reg_offset = const_offset->u[0] / 16;
+
+ emit(MOV(dest, src));
} else {
- src_reg tmp = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_D, 1);
- src.reladdr = new(mem_ctx) src_reg(tmp);
- }
+ src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
- emit(MOV(dest, src));
+ emit(SHADER_OPCODE_MOV_INDIRECT, dest, src,
+ indirect, brw_imm_ud(instr->const_index[1]));
+ }
break;
}
(unsigned) instr->const_index[0];
src_reg offset = get_nir_src(instr->src[0], nir_type_int,
instr->num_components);
+ const src_reg surface = brw_imm_ud(surf_index);
+ const vec4_builder bld =
+ vec4_builder(this).at_end().annotate(current_annotation, base_ir);
+ src_reg tmp;
+
dest = get_nir_dest(instr->dest);
switch (instr->intrinsic) {
- case nir_intrinsic_atomic_counter_inc:
- emit_untyped_atomic(BRW_AOP_INC, surf_index, dest, offset,
- src_reg(), src_reg());
- break;
- case nir_intrinsic_atomic_counter_dec:
- emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dest, offset,
- src_reg(), src_reg());
- break;
- case nir_intrinsic_atomic_counter_read:
- emit_untyped_surface_read(surf_index, dest, offset);
- break;
- default:
- unreachable("Unreachable");
+ case nir_intrinsic_atomic_counter_inc:
+ tmp = emit_untyped_atomic(bld, surface, offset,
+ src_reg(), src_reg(),
+ 1, 1,
+ BRW_AOP_INC);
+ break;
+ case nir_intrinsic_atomic_counter_dec:
+ tmp = emit_untyped_atomic(bld, surface, offset,
+ src_reg(), src_reg(),
+ 1, 1,
+ BRW_AOP_PREDEC);
+ break;
+ case nir_intrinsic_atomic_counter_read:
+ tmp = emit_untyped_read(bld, surface, offset, 1, 1);
+ break;
+ default:
+ unreachable("Unreachable");
}
+ bld.MOV(retype(dest, tmp.type), tmp);
brw_mark_surface_used(stage_prog_data, surf_index);
break;
}
const vec4_builder bld =
vec4_builder(this).at_end().annotate(current_annotation, base_ir);
- src_reg atomic_result =
- surface_access::emit_untyped_atomic(bld, surface, offset,
- data1, data2,
- 1 /* dims */, 1 /* rsize */,
- op,
- BRW_PREDICATE_NONE);
+ src_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
+ data1, data2,
+ 1 /* dims */, 1 /* rsize */,
+ op,
+ BRW_PREDICATE_NONE);
dest.type = atomic_result.type;
bld.MOV(dest, atomic_result);
}
inst->saturate = instr->dest.saturate;
break;
- case nir_op_fsin:
- inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]);
- inst->saturate = instr->dest.saturate;
+ case nir_op_fsin: {
+ src_reg tmp = src_reg(this, glsl_type::vec4_type);
+ inst = emit_math(SHADER_OPCODE_SIN, dst_reg(tmp), op[0]);
+ if (instr->dest.saturate) {
+ inst->dst = dst;
+ inst->saturate = true;
+ } else {
+ emit(MUL(dst, tmp, brw_imm_f(0.99997)));
+ }
break;
+ }
- case nir_op_fcos:
- inst = emit_math(SHADER_OPCODE_COS, dst, op[0]);
- inst->saturate = instr->dest.saturate;
+ case nir_op_fcos: {
+ src_reg tmp = src_reg(this, glsl_type::vec4_type);
+ inst = emit_math(SHADER_OPCODE_COS, dst_reg(tmp), op[0]);
+ if (instr->dest.saturate) {
+ inst->dst = dst;
+ inst->saturate = true;
+ } else {
+ emit(MUL(dst, tmp, brw_imm_f(0.99997)));
+ }
break;
+ }
case nir_op_idiv:
case nir_op_udiv:
break;
case nir_op_umod:
+ case nir_op_irem:
+ /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
+ * appears that our hardware just does the right thing for signed
+ * remainder.
+ */
emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
break;
+ case nir_op_imod: {
+ /* Get a regular C-style remainder. If a % b != 0, set the predicate. */
+ inst = emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
+
+ /* Math instructions don't support conditional mod */
+ inst = emit(MOV(dst_null_d(), src_reg(dst)));
+ inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+ /* Now, we need to determine whether the signs of the sources differ.
+ * When we XOR the sources, the top bit of the result is 0 if they are
+ * the same and 1 if they differ. We can then use a conditional modifier
+ * to turn that into a predicate, which leads us to an XOR.l instruction.
+ */
+ src_reg tmp = src_reg(this, glsl_type::ivec4_type);
+ inst = emit(XOR(dst_reg(tmp), op[0], op[1]));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->conditional_mod = BRW_CONDITIONAL_L;
+
+ /* If the result of the initial remainder operation is non-zero and the
+ * two sources have different signs, add in a copy of op[1] to get the
+ * final integer modulus value.
+ */
+ inst = emit(ADD(dst, src_reg(dst), op[1]));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ break;
+ }
+
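For reference, the scalar semantics the two cases above implement, as a sketch (not driver code):

   /* irem keeps the sign of the dividend (C's % operator); imod keeps the
    * sign of the divisor. The results differ only when the operand signs
    * differ and the remainder is non-zero, which is exactly the predicate
    * the XOR.l sequence above computes.
    */
   static int irem(int a, int b) { return a % b; }

   static int imod(int a, int b)
   {
      const int r = a % b;
      /* e.g. imod(-7, 3) == 2, while irem(-7, 3) == -1 */
      return (r != 0 && ((r < 0) != (b < 0))) ? r + b : r;
   }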
case nir_op_ldexp:
unreachable("not reached: should be handled by ldexp_to_arith()");
inst->saturate = instr->dest.saturate;
break;
+ case nir_op_fquantize2f16: {
+ /* See also vec4_visitor::emit_pack_half_2x16() */
+ src_reg tmp16 = src_reg(this, glsl_type::uvec4_type);
+ src_reg tmp32 = src_reg(this, glsl_type::vec4_type);
+ src_reg zero = src_reg(this, glsl_type::vec4_type);
+
+ /* Check for denormal */
+ src_reg abs_src0 = op[0];
+ abs_src0.abs = true;
+ emit(CMP(dst_null_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
+ BRW_CONDITIONAL_L));
+ /* Get the appropriately signed zero */
+ emit(AND(retype(dst_reg(zero), BRW_REGISTER_TYPE_UD),
+ retype(op[0], BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0x80000000)));
+ /* Do the actual F32 -> F16 -> F32 conversion */
+ emit(F32TO16(dst_reg(tmp16), op[0]));
+ emit(F16TO32(dst_reg(tmp32), tmp16));
+ /* Select the converted value, or the signed zero for denormals */
+ inst = emit(BRW_OPCODE_SEL, dst, zero, tmp32);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+
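A scalar model of the sequence above may help; float_to_half/half_to_float below are assumed stand-ins for the F32TO16/F16TO32 instructions, not Mesa API:

   #include <cmath>
   #include <cstdint>

   extern uint16_t float_to_half(float);  /* assumed helper */
   extern float half_to_float(uint16_t);  /* assumed helper */

   /* Round to half precision and back, but flush anything below the
    * smallest normal half-float (2^-14) to a zero carrying the input's
    * sign, which is what the predicated SEL selects.
    */
   static float quantize_f16(float x)
   {
      if (std::fabs(x) < std::ldexp(1.0f, -14))
         return std::copysign(0.0f, x);
      return half_to_float(float_to_half(x));
   }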
case nir_op_fmin:
case nir_op_imin:
case nir_op_umin:
break;
case nir_jump_return:
- /* fall through */
default:
unreachable("unknown jump");
}
nir_tex_instr_dest_size(instr));
dst_reg dest = get_nir_dest(instr->dest, instr->dest_type);
+ /* Our hardware requires a LOD for buffer textures */
+ if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+ lod = brw_imm_d(0);
+
/* Load the texture operation sources */
uint32_t constant_offset = 0;
for (unsigned i = 0; i < instr->num_srcs; i++) {
unreachable("not reached");
}
- void
- vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
- dst_reg dst, src_reg surf_offset,
- src_reg src0, src_reg src1)
- {
- unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
- src_reg src_payload(this, glsl_type::uint_type, mlen);
- dst_reg payload(src_payload);
- payload.writemask = WRITEMASK_X;
-
- /* Set the atomic operation offset. */
- emit(MOV(offset(payload, 0), surf_offset));
- unsigned i = 1;
-
- /* Set the atomic operation arguments. */
- if (src0.file != BAD_FILE) {
- emit(MOV(offset(payload, i), src0));
- i++;
- }
-
- if (src1.file != BAD_FILE) {
- emit(MOV(offset(payload, i), src1));
- i++;
- }
-
- /* Emit the instruction. Note that this maps to the normal SIMD8
- * untyped atomic message on Ivy Bridge, but that's OK because
- * unused channels will be masked out.
- */
- vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
- src_payload,
- brw_imm_ud(surf_index), brw_imm_ud(atomic_op));
- inst->mlen = mlen;
- }
-
- void
- vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
- src_reg surf_offset)
- {
- dst_reg offset(this, glsl_type::uint_type);
- offset.writemask = WRITEMASK_X;
-
- /* Set the surface read offset. */
- emit(MOV(offset, surf_offset));
-
- /* Emit the instruction. Note that this maps to the normal SIMD8
- * untyped surface read message, but that's OK because unused
- * channels will be masked out.
- */
- vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
- src_reg(offset),
- brw_imm_ud(surf_index), brw_imm_d(1));
- inst->mlen = 1;
- }
-
void
vec4_visitor::emit_ndc_computation()
{
}
}
-src_reg
-vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
- src_reg *reladdr, int reg_offset)
-{
- if (reladdr) {
- src_reg index = src_reg(this, glsl_type::int_type);
-
- emit_before(block, inst, ADD(dst_reg(index), *reladdr,
- brw_imm_d(reg_offset * 16)));
-
- return index;
- } else if (devinfo->gen >= 8) {
- /* Store the offset in a GRF so we can send-from-GRF. */
- src_reg offset = src_reg(this, glsl_type::int_type);
- emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16)));
- return offset;
- } else {
- return brw_imm_d(reg_offset * 16);
- }
-}
-
/**
* Emits an instruction before @inst to load the value named by @orig_src
* from scratch space at @base_offset to @temp.
void
vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
dst_reg temp, src_reg orig_src,
- int base_offset)
+ int base_offset, src_reg indirect)
{
int reg_offset = base_offset + orig_src.reg_offset;
const unsigned index = prog_data->base.binding_table.pull_constants_start;
- src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
- reg_offset);
+
+ src_reg offset;
+ if (indirect.file != BAD_FILE) {
+ offset = src_reg(this, glsl_type::int_type);
+
+ emit_before(block, inst, ADD(dst_reg(offset), indirect,
+ brw_imm_d(reg_offset * 16)));
+ } else if (devinfo->gen >= 8) {
+ /* Store the offset in a GRF so we can send-from-GRF. */
+ offset = src_reg(this, glsl_type::int_type);
+ emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16)));
+ } else {
+ offset = brw_imm_d(reg_offset * 16);
+ }
emit_pull_constant_load_reg(temp,
brw_imm_ud(index),
{
int pull_constant_loc[this->uniforms];
memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
- bool nested_reladdr;
- /* Walk through and find array access of uniforms. Put a copy of that
- * uniform in the pull constant buffer.
- *
- * Note that we don't move constant-indexed accesses to arrays. No
- * testing has been done of the performance impact of this choice.
+ /* First, walk through the instructions and determine which uniforms need
+ * to be pulled. We mark a uniform as needing to be pulled by setting its
+ * pull_constant_loc entry to 0.
*/
- do {
- nested_reladdr = false;
-
- foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
- for (int i = 0 ; i < 3; i++) {
- if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
- continue;
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ /* We only care about MOV_INDIRECT of a uniform */
+ if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
+ inst->src[0].file != UNIFORM)
+ continue;
- int uniform = inst->src[i].nr;
+ int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
- if (inst->src[i].reladdr->reladdr)
- nested_reladdr = true; /* will need another pass */
+ for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
+ pull_constant_loc[uniform_nr + j] = 0;
+ }
- /* If this array isn't already present in the pull constant buffer,
- * add it.
- */
- if (pull_constant_loc[uniform] == -1) {
- const gl_constant_value **values =
- &stage_prog_data->param[uniform * 4];
+ /* Next, we walk the list of uniforms and assign real pull constant
+ * locations and set their corresponding entries in pull_param.
+ */
+ for (int j = 0; j < this->uniforms; j++) {
+ if (pull_constant_loc[j] < 0)
+ continue;
- pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
+ pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
- assert(uniform < uniform_array_size);
- for (int j = 0; j < uniform_size[uniform] * 4; j++) {
- stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
- = values[j];
- }
- }
+ for (int i = 0; i < 4; i++) {
+ stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
+ = stage_prog_data->param[j * 4 + i];
+ }
+ }
- /* Set up the annotation tracking for new generated instructions. */
- base_ir = inst->ir;
- current_annotation = inst->annotation;
+ /* Finally, we can walk through the instructions and lower MOV_INDIRECT
+ * instructions to actual uniform pulls.
+ */
+ foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+ /* We only care about MOV_INDIRECT of a uniform */
+ if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
+ inst->src[0].file != UNIFORM)
+ continue;
- dst_reg temp = dst_reg(this, glsl_type::vec4_type);
+ int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
- emit_pull_constant_load(block, inst, temp, inst->src[i],
- pull_constant_loc[uniform]);
+ assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
- inst->src[i].file = temp.file;
- inst->src[i].nr = temp.nr;
- inst->src[i].reg_offset = temp.reg_offset;
- inst->src[i].reladdr = NULL;
- }
- }
- } while (nested_reladdr);
+ emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
+ pull_constant_loc[uniform_nr], inst->src[1]);
+ inst->remove(block);
+ }
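The mark/assign/lower shape of this pass, in miniature (illustrative only; `loc` plays the role of pull_constant_loc):

   #include <vector>

   /* Pass 1 marks entries with 0 and leaves -1 for uniforms that stay as
    * push constants; pass 2 then hands out consecutive pull-constant slots.
    */
   static void assign_slots(std::vector<int> &loc)
   {
      int next = 0;
      for (int &l : loc)
         if (l == 0)
            l = next++;
   }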
/* Now there are no accesses of the UNIFORM file with a reladdr, so
* no need to track them as larger-than-vec4 objects. This will be
this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
this->uniforms = 0;
-
- /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
- * at least one. See setup_uniforms() in brw_vec4.cpp.
- */
- this->uniform_array_size = 1;
- if (prog_data) {
- this->uniform_array_size =
- MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
- }
-
- this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
}
vec4_visitor::~vec4_visitor()
*/
GLuint Binding;
+ /**
+ * Vulkan descriptor set qualifier for this block.
+ */
+ GLuint Set;
+
/**
* Minimum size (in bytes) of a buffer object to back this uniform buffer
* (GL_UNIFORM_BLOCK_DATA_SIZE).
GLboolean ARB_indirect_parameters;
GLboolean ARB_instanced_arrays;
GLboolean ARB_internalformat_query;
+ GLboolean ARB_internalformat_query2;
GLboolean ARB_map_buffer_range;
GLboolean ARB_occlusion_query;
GLboolean ARB_occlusion_query2;
GLboolean ARB_query_buffer_object;
GLboolean ARB_sample_shading;
GLboolean ARB_seamless_cube_map;
+ GLboolean ARB_shader_atomic_counter_ops;
GLboolean ARB_shader_atomic_counters;
GLboolean ARB_shader_bit_encoding;
GLboolean ARB_shader_clock;