anv: pCreateInfo->pApplicationInfo parameter to vkCreateInstance may be NULL
[mesa.git] / src / vulkan / anv_device.c
index 0043bea5d13d6f8dc1feed548563f5fbd2dcca1e..68639068324bcf8a8e7b3506c43139301b22060e 100644 (file)
@@ -30,6 +30,7 @@
 #include "anv_private.h"
 #include "mesa/main/git_sha1.h"
 #include "util/strtod.h"
+#include "util/debug.h"
 
 #include "gen7_pack.h"
 
@@ -89,12 +90,9 @@ anv_physical_device_init(struct anv_physical_device *device,
       fprintf(stderr, "WARNING: Ivy Bridge Vulkan support is incomplete\n");
    } else if (device->info->gen == 7 && device->info->is_baytrail) {
       fprintf(stderr, "WARNING: Bay Trail Vulkan support is incomplete\n");
-   } else if (device->info->gen == 9 && !device->info->is_broxton) {
-      fprintf(stderr, "WARNING: Skylake Vulkan support is incomplete\n");
-   } else if (device->info->gen == 9 && device->info->is_broxton) {
-      fprintf(stderr, "WARNING: Broxton Vulkan support is incomplete\n");
-   } else if (device->info->gen == 8) {
-      /* Broadwell/Cherryview is as fully supported as anything */
+   } else if (device->info->gen >= 8) {
+      /* Broadwell, Cherryview, Skylake, Broxton, Kabylake are as fully
+       * supported as anything */
    } else {
       result = vk_errorf(VK_ERROR_INCOMPATIBLE_DRIVER,
                          "Vulkan not yet supported on %s", device->name);
@@ -126,6 +124,8 @@ anv_physical_device_init(struct anv_physical_device *device,
       goto fail;
    }
 
+   bool swizzled = anv_gem_get_bit6_swizzle(fd, I915_TILING_X);
+
    close(fd);
 
    brw_process_intel_debug_variable();
@@ -138,7 +138,8 @@ anv_physical_device_init(struct anv_physical_device *device,
    device->compiler->shader_debug_log = compiler_debug_log;
    device->compiler->shader_perf_log = compiler_perf_log;
 
-   isl_device_init(&device->isl_dev, device->info);
+   /* Bit-6 swizzling was detected above via anv_gem_get_bit6_swizzle() */
+   isl_device_init(&device->isl_dev, device->info, swizzled);
 
    return VK_SUCCESS;
 
@@ -156,7 +157,7 @@ anv_physical_device_finish(struct anv_physical_device *device)
 static const VkExtensionProperties global_extensions[] = {
    {
       .extensionName = VK_KHR_SURFACE_EXTENSION_NAME,
-      .specVersion = 24,
+      .specVersion = 25,
    },
    {
       .extensionName = VK_KHR_XCB_SURFACE_EXTENSION_NAME,
@@ -213,10 +214,19 @@ VkResult anv_CreateInstance(
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO);
 
-   if (pCreateInfo->pApplicationInfo->apiVersion != VK_MAKE_VERSION(0, 210, 1))
-      return vk_error(VK_ERROR_INCOMPATIBLE_DRIVER);
+   uint32_t client_version = pCreateInfo->pApplicationInfo ?
+                             pCreateInfo->pApplicationInfo->apiVersion :
+                             VK_MAKE_VERSION(1, 0, 0);
+   if (VK_MAKE_VERSION(1, 0, 0) > client_version ||
+       client_version > VK_MAKE_VERSION(1, 0, 3)) {
+      return vk_errorf(VK_ERROR_INCOMPATIBLE_DRIVER,
+                       "Client requested version %d.%d.%d",
+                       VK_VERSION_MAJOR(client_version),
+                       VK_VERSION_MINOR(client_version),
+                       VK_VERSION_PATCH(client_version));
+   }
 
-   for (uint32_t i = 0; i < pCreateInfo->enabledExtensionNameCount; i++) {
+   for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) {
       bool found = false;
       for (uint32_t j = 0; j < ARRAY_SIZE(global_extensions); j++) {
          if (strcmp(pCreateInfo->ppEnabledExtensionNames[i],
@@ -230,7 +240,7 @@ VkResult anv_CreateInstance(
    }
 
    instance = anv_alloc2(&default_alloc, pAllocator, sizeof(*instance), 8,
-                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+                         VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
    if (!instance)
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -241,7 +251,7 @@ VkResult anv_CreateInstance(
    else
       instance->alloc = default_alloc;
 
-   instance->apiVersion = pCreateInfo->pApplicationInfo->apiVersion;
+   instance->apiVersion = client_version;
    instance->physicalDeviceCount = -1;
 
    _mesa_locale_init();
@@ -332,7 +342,7 @@ void anv_GetPhysicalDeviceFeatures(
    anv_finishme("Get correct values for PhysicalDeviceFeatures");
 
    *pFeatures = (VkPhysicalDeviceFeatures) {
-      .robustBufferAccess                       = false,
+      .robustBufferAccess                       = true,
       .fullDrawIndexUint32                      = false,
       .imageCubeArray                           = false,
       .independentBlend                         = false,
@@ -341,7 +351,8 @@ void anv_GetPhysicalDeviceFeatures(
       .sampleRateShading                        = false,
       .dualSrcBlend                             = true,
       .logicOp                                  = true,
-      .multiDrawIndirect                        = true,
+      .multiDrawIndirect                        = false,
+      .drawIndirectFirstInstance                = false,
       .depthClamp                               = false,
       .depthBiasClamp                           = false,
       .fillModeNonSolid                         = true,
@@ -375,9 +386,17 @@ void anv_GetPhysicalDeviceFeatures(
       .shaderInt16                              = false,
       .alphaToOne                               = true,
       .variableMultisampleRate                  = false,
+      .inheritedQueries                         = false,
    };
 }
 
+void
+anv_device_get_cache_uuid(void *uuid)
+{
+   memset(uuid, 0, VK_UUID_SIZE);
+   snprintf(uuid, VK_UUID_SIZE, "anv-%s", MESA_GIT_SHA1 + 4);
+}
+
 void anv_GetPhysicalDeviceProperties(
     VkPhysicalDevice                            physicalDevice,
     VkPhysicalDeviceProperties*                 pProperties)
@@ -387,11 +406,10 @@ void anv_GetPhysicalDeviceProperties(
 
    anv_finishme("Get correct values for VkPhysicalDeviceLimits");
 
+   const float time_stamp_base = devinfo->gen >= 9 ? 83.333 : 80.0;
+
    VkSampleCountFlags sample_counts =
-      VK_SAMPLE_COUNT_1_BIT |
-      VK_SAMPLE_COUNT_2_BIT |
-      VK_SAMPLE_COUNT_4_BIT |
-      VK_SAMPLE_COUNT_8_BIT;
+      isl_device_get_sample_counts(&pdevice->isl_dev);
 
    VkPhysicalDeviceLimits limits = {
       .maxImageDimension1D                      = (1 << 14),
@@ -399,12 +417,12 @@ void anv_GetPhysicalDeviceProperties(
       .maxImageDimension3D                      = (1 << 10),
       .maxImageDimensionCube                    = (1 << 14),
       .maxImageArrayLayers                      = (1 << 10),
-      .maxTexelBufferElements                   = (1 << 14),
+      .maxTexelBufferElements                   = 128 * 1024 * 1024,
       .maxUniformBufferRange                    = UINT32_MAX,
       .maxStorageBufferRange                    = UINT32_MAX,
       .maxPushConstantsSize                     = MAX_PUSH_CONSTANTS_SIZE,
       .maxMemoryAllocationCount                 = UINT32_MAX,
-      .maxSamplerAllocationCount                = UINT32_MAX,
+      .maxSamplerAllocationCount                = 64 * 1024,
       .bufferImageGranularity                   = 64, /* A cache line */
       .sparseAddressSpaceSize                   = 0,
       .maxBoundDescriptorSets                   = MAX_SETS,
@@ -425,9 +443,9 @@ void anv_GetPhysicalDeviceProperties(
       .maxDescriptorSetInputAttachments         = 256,
       .maxVertexInputAttributes                 = 32,
       .maxVertexInputBindings                   = 32,
-      .maxVertexInputAttributeOffset            = 256,
-      .maxVertexInputBindingStride              = 256,
-      .maxVertexOutputComponents                = 32,
+      .maxVertexInputAttributeOffset            = 2047,
+      .maxVertexInputBindingStride              = 2048,
+      .maxVertexOutputComponents                = 128,
       .maxTessellationGenerationLevel           = 0,
       .maxTessellationPatchSize                 = 0,
       .maxTessellationControlPerVertexInputComponents = 0,
@@ -436,21 +454,17 @@ void anv_GetPhysicalDeviceProperties(
       .maxTessellationControlTotalOutputComponents = 0,
       .maxTessellationEvaluationInputComponents = 0,
       .maxTessellationEvaluationOutputComponents = 0,
-      .maxGeometryShaderInvocations             = 6,
-      .maxGeometryInputComponents               = 16,
-      .maxGeometryOutputComponents              = 16,
-      .maxGeometryOutputVertices                = 16,
-      .maxGeometryTotalOutputComponents         = 16,
-      .maxFragmentInputComponents               = 16,
+      .maxGeometryShaderInvocations             = 32,
+      .maxGeometryInputComponents               = 64,
+      .maxGeometryOutputComponents              = 128,
+      .maxGeometryOutputVertices                = 256,
+      .maxGeometryTotalOutputComponents         = 1024,
+      .maxFragmentInputComponents               = 128,
       .maxFragmentOutputAttachments             = 8,
       .maxFragmentDualSrcAttachments            = 2,
       .maxFragmentCombinedOutputResources       = 8,
-      .maxComputeSharedMemorySize               = 1024,
-      .maxComputeWorkGroupCount = {
-         16 * devinfo->max_cs_threads,
-         16 * devinfo->max_cs_threads,
-         16 * devinfo->max_cs_threads,
-      },
+      .maxComputeSharedMemorySize               = 32768,
+      .maxComputeWorkGroupCount                 = { 65535, 65535, 65535 },
       .maxComputeWorkGroupInvocations           = 16 * devinfo->max_cs_threads,
       .maxComputeWorkGroupSize = {
          16 * devinfo->max_cs_threads,
@@ -466,16 +480,16 @@ void anv_GetPhysicalDeviceProperties(
       .maxSamplerAnisotropy                     = 16,
       .maxViewports                             = MAX_VIEWPORTS,
       .maxViewportDimensions                    = { (1 << 14), (1 << 14) },
-      .viewportBoundsRange                      = { -1.0, 1.0 }, /* FIXME */
+      .viewportBoundsRange                      = { -16384.0, 16384.0 },
       .viewportSubPixelBits                     = 13, /* We take a float? */
-      .minMemoryMapAlignment                    = 64, /* A cache line */
+      .minMemoryMapAlignment                    = 4096, /* A page */
       .minTexelBufferOffsetAlignment            = 1,
       .minUniformBufferOffsetAlignment          = 1,
       .minStorageBufferOffsetAlignment          = 1,
-      .minTexelOffset                           = 0, /* FIXME */
-      .maxTexelOffset                           = 0, /* FIXME */
-      .minTexelGatherOffset                     = 0, /* FIXME */
-      .maxTexelGatherOffset                     = 0, /* FIXME */
+      .minTexelOffset                           = -8,
+      .maxTexelOffset                           = 7,
+      .minTexelGatherOffset                     = -8,
+      .maxTexelGatherOffset                     = 7,
       .minInterpolationOffset                   = 0, /* FIXME */
       .maxInterpolationOffset                   = 0, /* FIXME */
       .subPixelInterpolationOffsetBits          = 0, /* FIXME */
@@ -493,7 +507,8 @@ void anv_GetPhysicalDeviceProperties(
       .sampledImageStencilSampleCounts          = sample_counts,
       .storageImageSampleCounts                 = VK_SAMPLE_COUNT_1_BIT,
       .maxSampleMaskWords                       = 1,
-      .timestampPeriod                          = 80.0 / (1000 * 1000 * 1000),
+      .timestampComputeAndGraphics              = false,
+      .timestampPeriod                          = time_stamp_base / (1000 * 1000 * 1000),
       .maxClipDistances                         = 0 /* FIXME */,
       .maxCullDistances                         = 0 /* FIXME */,
       .maxCombinedClipAndCullDistances          = 0 /* FIXME */,
@@ -503,14 +518,14 @@ void anv_GetPhysicalDeviceProperties(
       .pointSizeGranularity                     = (1.0 / 8.0),
       .lineWidthGranularity                     = (1.0 / 128.0),
       .strictLines                              = false, /* FINISHME */
-      .standardSampleLocations                  = true, /* FINISHME */
+      .standardSampleLocations                  = true,
       .optimalBufferCopyOffsetAlignment         = 128,
       .optimalBufferCopyRowPitchAlignment       = 128,
       .nonCoherentAtomSize                      = 64,
    };
 
    *pProperties = (VkPhysicalDeviceProperties) {
-      .apiVersion = VK_MAKE_VERSION(0, 210, 1),
+      .apiVersion = VK_MAKE_VERSION(1, 0, 2),
       .driverVersion = 1,
       .vendorID = 0x8086,
       .deviceID = pdevice->chipset_id,
@@ -520,8 +535,7 @@ void anv_GetPhysicalDeviceProperties(
    };
 
    strcpy(pProperties->deviceName, pdevice->name);
-   snprintf((char *)pProperties->pipelineCacheUUID, VK_UUID_SIZE,
-            "anv-%s", MESA_GIT_SHA1 + 4);
+   anv_device_get_cache_uuid(pProperties->pipelineCacheUUID);
 }
 
 void anv_GetPhysicalDeviceQueueFamilyProperties(
@@ -541,7 +555,7 @@ void anv_GetPhysicalDeviceQueueFamilyProperties(
                     VK_QUEUE_COMPUTE_BIT |
                     VK_QUEUE_TRANSFER_BIT,
       .queueCount = 1,
-      .timestampValidBits = 0, /* XXX: Real value here */
+      .timestampValidBits = 36, /* XXX: Real value here */
       .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 },
    };
 }
@@ -568,7 +582,7 @@ void anv_GetPhysicalDeviceMemoryProperties(
                           VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                           VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
                           VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
-         .heapIndex = 1,
+         .heapIndex = 0,
       };
    } else {
       /* The spec requires that we expose a host-visible, coherent memory
@@ -581,13 +595,13 @@ void anv_GetPhysicalDeviceMemoryProperties(
          .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
                           VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                           VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
-         .heapIndex = 1,
+         .heapIndex = 0,
       };
       pMemoryProperties->memoryTypes[1] = (VkMemoryType) {
          .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
                           VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                           VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
-         .heapIndex = 1,
+         .heapIndex = 0,
       };
    }
 
@@ -605,6 +619,20 @@ PFN_vkVoidFunction anv_GetInstanceProcAddr(
    return anv_lookup_entrypoint(pName);
 }
 
+/* The loader wants us to expose a second GetInstanceProcAddr function
+ * to work around certain LD_PRELOAD issues seen in apps.
+ */
+VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr(
+    VkInstance                                  instance,
+    const char*                                 pName);
+
+VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr(
+    VkInstance                                  instance,
+    const char*                                 pName)
+{
+   return anv_GetInstanceProcAddr(instance, pName);
+}
+
 PFN_vkVoidFunction anv_GetDeviceProcAddr(
     VkDevice                                    device,
     const char*                                 pName)
@@ -641,10 +669,19 @@ anv_state_pool_emit_data(struct anv_state_pool *pool, size_t size, size_t align,
    return state;
 }
 
+struct gen8_border_color {
+   union {
+      float float32[4];
+      uint32_t uint32[4];
+   };
+   /* Pad out to 64 bytes */
+   uint32_t _pad[12];
+};
+
 static void
 anv_device_init_border_colors(struct anv_device *device)
 {
-   static const VkClearColorValue border_colors[] = {
+   static const struct gen8_border_color border_colors[] = {
       [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] =  { .float32 = { 0.0, 0.0, 0.0, 0.0 } },
       [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] =       { .float32 = { 0.0, 0.0, 0.0, 1.0 } },
       [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] =       { .float32 = { 1.0, 1.0, 1.0, 1.0 } },
@@ -654,7 +691,75 @@ anv_device_init_border_colors(struct anv_device *device)
    };
 
    device->border_colors = anv_state_pool_emit_data(&device->dynamic_state_pool,
-                                                    sizeof(border_colors), 32, border_colors);
+                                                    sizeof(border_colors), 64,
+                                                    border_colors);
+}
+
+VkResult
+anv_device_submit_simple_batch(struct anv_device *device,
+                               struct anv_batch *batch)
+{
+   struct drm_i915_gem_execbuffer2 execbuf;
+   struct drm_i915_gem_exec_object2 exec2_objects[1];
+   struct anv_bo bo;
+   VkResult result = VK_SUCCESS;
+   uint32_t size;
+   int64_t timeout;
+   int ret;
+
+   /* Kernel driver requires 8 byte aligned batch length */
+   size = align_u32(batch->next - batch->start, 8);
+   assert(size < device->batch_bo_pool.bo_size);
+   result = anv_bo_pool_alloc(&device->batch_bo_pool, &bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   memcpy(bo.map, batch->start, size);
+   if (!device->info.has_llc)
+      anv_clflush_range(bo.map, size);
+
+   exec2_objects[0].handle = bo.gem_handle;
+   exec2_objects[0].relocation_count = 0;
+   exec2_objects[0].relocs_ptr = 0;
+   exec2_objects[0].alignment = 0;
+   exec2_objects[0].offset = bo.offset;
+   exec2_objects[0].flags = 0;
+   exec2_objects[0].rsvd1 = 0;
+   exec2_objects[0].rsvd2 = 0;
+
+   execbuf.buffers_ptr = (uintptr_t) exec2_objects;
+   execbuf.buffer_count = 1;
+   execbuf.batch_start_offset = 0;
+   execbuf.batch_len = size;
+   execbuf.cliprects_ptr = 0;
+   execbuf.num_cliprects = 0;
+   execbuf.DR1 = 0;
+   execbuf.DR4 = 0;
+
+   execbuf.flags =
+      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
+   execbuf.rsvd1 = device->context_id;
+   execbuf.rsvd2 = 0;
+
+   ret = anv_gem_execbuffer(device, &execbuf);
+   if (ret != 0) {
+      /* We don't know the real error. */
+      result = vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, "execbuf2 failed: %m");
+      goto fail;
+   }
+
+   timeout = INT64_MAX;
+   ret = anv_gem_wait(device, bo.gem_handle, &timeout);
+   if (ret != 0) {
+      /* We don't know the real error. */
+      result = vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, "execbuf2 failed: %m");
+      goto fail;
+   }
+
+ fail:
+   anv_bo_pool_free(&device->batch_bo_pool, &bo);
+
+   return result;
 }
 
 VkResult anv_CreateDevice(
@@ -664,11 +769,12 @@ VkResult anv_CreateDevice(
     VkDevice*                                   pDevice)
 {
    ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
+   VkResult result;
    struct anv_device *device;
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
 
-   for (uint32_t i = 0; i < pCreateInfo->enabledExtensionNameCount; i++) {
+   for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) {
       bool found = false;
       for (uint32_t j = 0; j < ARRAY_SIZE(device_extensions); j++) {
          if (strcmp(pCreateInfo->ppEnabledExtensionNames[i],
@@ -685,12 +791,13 @@ VkResult anv_CreateDevice(
 
    device = anv_alloc2(&physical_device->instance->alloc, pAllocator,
                        sizeof(*device), 8,
-                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+                       VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
    if (!device)
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
    device->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
    device->instance = physical_device->instance;
+   device->chipset_id = physical_device->chipset_id;
 
    if (pAllocator)
       device->alloc = *pAllocator;
@@ -699,23 +806,32 @@ VkResult anv_CreateDevice(
 
    /* XXX(chadv): Can we dup() physicalDevice->fd here? */
    device->fd = open(physical_device->path, O_RDWR | O_CLOEXEC);
-   if (device->fd == -1)
+   if (device->fd == -1) {
+      result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
       goto fail_device;
+   }
 
    device->context_id = anv_gem_create_context(device);
-   if (device->context_id == -1)
+   if (device->context_id == -1) {
+      result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
       goto fail_fd;
+   }
+
+   device->info = *physical_device->info;
+   device->isl_dev = physical_device->isl_dev;
 
    pthread_mutex_init(&device->mutex, NULL);
 
    anv_bo_pool_init(&device->batch_bo_pool, device, ANV_CMD_BUFFER_BATCH_SIZE);
 
-   anv_block_pool_init(&device->dynamic_state_block_pool, device, 2048);
+   anv_block_pool_init(&device->dynamic_state_block_pool, device, 16384);
 
    anv_state_pool_init(&device->dynamic_state_pool,
                        &device->dynamic_state_block_pool);
 
-   anv_block_pool_init(&device->instruction_block_pool, device, 8192);
+   anv_block_pool_init(&device->instruction_block_pool, device, 128 * 1024);
+   anv_pipeline_cache_init(&device->default_pipeline_cache, device);
+
    anv_block_pool_init(&device->surface_state_block_pool, device, 4096);
 
    anv_state_pool_init(&device->surface_state_pool,
@@ -725,12 +841,32 @@ VkResult anv_CreateDevice(
 
    anv_block_pool_init(&device->scratch_block_pool, device, 0x10000);
 
-   device->info = *physical_device->info;
-   device->isl_dev = physical_device->isl_dev;
-
    anv_queue_init(device, &device->queue);
 
-   anv_device_init_meta(device);
+   switch (device->info.gen) {
+   case 7:
+      if (!device->info.is_haswell)
+         result = gen7_init_device_state(device);
+      else
+         result = gen75_init_device_state(device);
+      break;
+   case 8:
+      result = gen8_init_device_state(device);
+      break;
+   case 9:
+      result = gen9_init_device_state(device);
+      break;
+   default:
+      /* Shouldn't get here as we don't create physical devices for any other
+       * gens. */
+      unreachable("unhandled gen");
+   }
+   if (result != VK_SUCCESS)
+      goto fail_fd;
+
+   result = anv_device_init_meta(device);
+   if (result != VK_SUCCESS)
+      goto fail_fd;
 
    anv_device_init_border_colors(device);
 
@@ -743,7 +879,7 @@ VkResult anv_CreateDevice(
  fail_device:
    anv_free(&device->alloc, device);
 
-   return vk_error(VK_ERROR_INITIALIZATION_FAILED);
+   return result;
 }
 
 void anv_DestroyDevice(
@@ -776,6 +912,8 @@ void anv_DestroyDevice(
 
    close(device->fd);
 
+   pthread_mutex_destroy(&device->mutex);
+
    anv_free(&device->alloc, device);
 }
 
@@ -880,20 +1018,20 @@ VkResult anv_QueueSubmit(
                              "execbuf2 failed: %m");
          }
 
-         if (fence) {
-            ret = anv_gem_execbuffer(device, &fence->execbuf);
-            if (ret != 0) {
-               /* We don't know the real error. */
-               return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                                "execbuf2 failed: %m");
-            }
-         }
-
          for (uint32_t k = 0; k < cmd_buffer->execbuf2.bo_count; k++)
             cmd_buffer->execbuf2.bos[k]->offset = cmd_buffer->execbuf2.objects[k].offset;
       }
    }
 
+   if (fence) {
+      ret = anv_gem_execbuffer(device, &fence->execbuf);
+      if (ret != 0) {
+         /* We don't know the real error. */
+         return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                          "execbuf2 failed: %m");
+      }
+   }
+
    return VK_SUCCESS;
 }
 
@@ -909,71 +1047,16 @@ VkResult anv_DeviceWaitIdle(
     VkDevice                                    _device)
 {
    ANV_FROM_HANDLE(anv_device, device, _device);
-   struct anv_state state;
    struct anv_batch batch;
-   struct drm_i915_gem_execbuffer2 execbuf;
-   struct drm_i915_gem_exec_object2 exec2_objects[1];
-   struct anv_bo *bo = NULL;
-   VkResult result;
-   int64_t timeout;
-   int ret;
 
-   state = anv_state_pool_alloc(&device->dynamic_state_pool, 32, 32);
-   bo = &device->dynamic_state_pool.block_pool->bo;
-   batch.start = batch.next = state.map;
-   batch.end = state.map + 32;
+   uint32_t cmds[8];
+   batch.start = batch.next = cmds;
+   batch.end = (void *) cmds + sizeof(cmds);
+
    anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END);
    anv_batch_emit(&batch, GEN7_MI_NOOP);
 
-   if (!device->info.has_llc)
-      anv_state_clflush(state);
-
-   exec2_objects[0].handle = bo->gem_handle;
-   exec2_objects[0].relocation_count = 0;
-   exec2_objects[0].relocs_ptr = 0;
-   exec2_objects[0].alignment = 0;
-   exec2_objects[0].offset = bo->offset;
-   exec2_objects[0].flags = 0;
-   exec2_objects[0].rsvd1 = 0;
-   exec2_objects[0].rsvd2 = 0;
-
-   execbuf.buffers_ptr = (uintptr_t) exec2_objects;
-   execbuf.buffer_count = 1;
-   execbuf.batch_start_offset = state.offset;
-   execbuf.batch_len = batch.next - state.map;
-   execbuf.cliprects_ptr = 0;
-   execbuf.num_cliprects = 0;
-   execbuf.DR1 = 0;
-   execbuf.DR4 = 0;
-
-   execbuf.flags =
-      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
-   execbuf.rsvd1 = device->context_id;
-   execbuf.rsvd2 = 0;
-
-   ret = anv_gem_execbuffer(device, &execbuf);
-   if (ret != 0) {
-      /* We don't know the real error. */
-      result = vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, "execbuf2 failed: %m");
-      goto fail;
-   }
-
-   timeout = INT64_MAX;
-   ret = anv_gem_wait(device, bo->gem_handle, &timeout);
-   if (ret != 0) {
-      /* We don't know the real error. */
-      result = vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, "execbuf2 failed: %m");
-      goto fail;
-   }
-
-   anv_state_pool_free(&device->dynamic_state_pool, state);
-
-   return VK_SUCCESS;
-
- fail:
-   anv_state_pool_free(&device->dynamic_state_pool, state);
-
-   return result;
+   return anv_device_submit_simple_batch(device, &batch);
 }
 
 VkResult
@@ -1020,7 +1103,10 @@ VkResult anv_AllocateMemory(
    if (mem == NULL)
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
-   result = anv_bo_init_new(&mem->bo, device, pAllocateInfo->allocationSize);
+   /* The kernel is going to give us whole pages anyway */
+   uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
+
+   result = anv_bo_init_new(&mem->bo, device, alloc_size);
    if (result != VK_SUCCESS)
       goto fail;
 
@@ -1072,6 +1158,9 @@ VkResult anv_MapMemory(
       return VK_SUCCESS;
    }
 
+   if (size == VK_WHOLE_SIZE)
+      size = mem->bo.size - offset;
+
    /* FIXME: Is this supposed to be thread safe? Since vkUnmapMemory() only
     * takes a VkDeviceMemory pointer, it seems like only one map of the memory
     * at a time is valid. We could just mmap up front and return an offset
@@ -1082,10 +1171,19 @@ VkResult anv_MapMemory(
    if (!device->info.has_llc && mem->type_index == 0)
       gem_flags |= I915_MMAP_WC;
 
-   mem->map = anv_gem_mmap(device, mem->bo.gem_handle, offset, size, gem_flags);
-   mem->map_size = size;
+   /* GEM will fail to map if the offset isn't 4k-aligned.  Round down. */
+   uint64_t map_offset = offset & ~4095ull;
+   assert(offset >= map_offset);
+   uint64_t map_size = (offset + size) - map_offset;
 
-   *ppData = mem->map;
+   /* Let's map whole pages */
+   map_size = align_u64(map_size, 4096);
+
+   mem->map = anv_gem_mmap(device, mem->bo.gem_handle,
+                           map_offset, map_size, gem_flags);
+   mem->map_size = map_size;
+
+   *ppData = mem->map + (offset - map_offset);
 
    return VK_SUCCESS;
 }
@@ -1110,7 +1208,12 @@ clflush_mapped_ranges(struct anv_device         *device,
    for (uint32_t i = 0; i < count; i++) {
       ANV_FROM_HANDLE(anv_device_memory, mem, ranges[i].memory);
       void *p = mem->map + (ranges[i].offset & ~CACHELINE_MASK);
-      void *end = mem->map + ranges[i].offset + ranges[i].size;
+      void *end;
+
+      if (ranges[i].offset + ranges[i].size > mem->map_size)
+         end = mem->map + mem->map_size;
+      else
+         end = mem->map + ranges[i].offset + ranges[i].size;
 
       while (p < end) {
          __builtin_ia32_clflush(p);
@@ -1130,7 +1233,7 @@ VkResult anv_FlushMappedMemoryRanges(
       return VK_SUCCESS;
 
    /* Make sure the writes we're flushing have landed. */
-   __builtin_ia32_sfence();
+   __builtin_ia32_mfence();
 
    clflush_mapped_ranges(device, memoryRangeCount, pMemoryRanges);
 
@@ -1150,7 +1253,7 @@ VkResult anv_InvalidateMappedMemoryRanges(
    clflush_mapped_ranges(device, memoryRangeCount, pMemoryRanges);
 
    /* Make sure no reads get moved up above the invalidate. */
-   __builtin_ia32_lfence();
+   __builtin_ia32_mfence();
 
    return VK_SUCCESS;
 }
@@ -1299,7 +1402,7 @@ VkResult anv_CreateFence(
    if (!device->info.has_llc) {
       assert(((uintptr_t) fence->bo.map & CACHELINE_MASK) == 0);
       assert(batch.next - fence->bo.map <= CACHELINE_SIZE);
-      __builtin_ia32_sfence();
+      __builtin_ia32_mfence();
       __builtin_ia32_clflush(fence->bo.map);
    }
 
@@ -1326,6 +1429,8 @@ VkResult anv_CreateFence(
    fence->execbuf.rsvd1 = device->context_id;
    fence->execbuf.rsvd2 = 0;
 
+   fence->ready = false;
+
    *pFence = anv_fence_to_handle(fence);
 
    return VK_SUCCESS;
@@ -1429,8 +1534,13 @@ VkResult anv_CreateSemaphore(
     const VkAllocationCallbacks*                pAllocator,
     VkSemaphore*                                pSemaphore)
 {
+   /* The DRM execbuffer ioctl always executes in-order, even between different
+    * rings. As such, there's nothing to do for the user space semaphore.
+    */
+
    *pSemaphore = (VkSemaphore)1;
-   stub_return(VK_SUCCESS);
+
+   return VK_SUCCESS;
 }
 
 void anv_DestroySemaphore(
@@ -1438,7 +1548,6 @@ void anv_DestroySemaphore(
     VkSemaphore                                 semaphore,
     const VkAllocationCallbacks*                pAllocator)
 {
-   stub();
 }
 
 // Event functions
@@ -1456,14 +1565,14 @@ VkResult anv_CreateEvent(
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_EVENT_CREATE_INFO);
 
    state = anv_state_pool_alloc(&device->dynamic_state_pool,
-                                sizeof(*event), 4);
+                                sizeof(*event), 8);
    event = state.map;
    event->state = state;
    event->semaphore = VK_EVENT_RESET;
 
    if (!device->info.has_llc) {
       /* Make sure the writes we're flushing have landed. */
-      __builtin_ia32_sfence();
+      __builtin_ia32_mfence();
       __builtin_ia32_clflush(event);
    }
 
@@ -1491,9 +1600,10 @@ VkResult anv_GetEventStatus(
    ANV_FROM_HANDLE(anv_event, event, _event);
 
    if (!device->info.has_llc) {
-      /* Make sure the writes we're flushing have landed. */
+      /* Invalidate read cache before reading event written by GPU. */
       __builtin_ia32_clflush(event);
-      __builtin_ia32_lfence();
+      __builtin_ia32_mfence();
+
    }
 
    return event->semaphore;
@@ -1510,7 +1620,7 @@ VkResult anv_SetEvent(
 
    if (!device->info.has_llc) {
       /* Make sure the writes we're flushing have landed. */
-      __builtin_ia32_sfence();
+      __builtin_ia32_mfence();
       __builtin_ia32_clflush(event);
    }
 
@@ -1528,7 +1638,7 @@ VkResult anv_ResetEvent(
 
    if (!device->info.has_llc) {
       /* Make sure the writes we're flushing have landed. */
-      __builtin_ia32_sfence();
+      __builtin_ia32_mfence();
       __builtin_ia32_clflush(event);
    }
 
@@ -1575,26 +1685,31 @@ void anv_DestroyBuffer(
 }
 
 void
-anv_fill_buffer_surface_state(struct anv_device *device, void *state,
+anv_fill_buffer_surface_state(struct anv_device *device, struct anv_state state,
                               enum isl_format format,
                               uint32_t offset, uint32_t range, uint32_t stride)
 {
    switch (device->info.gen) {
    case 7:
       if (device->info.is_haswell)
-         gen75_fill_buffer_surface_state(state, format, offset, range, stride);
+         gen75_fill_buffer_surface_state(state.map, format, offset, range,
+                                         stride);
       else
-         gen7_fill_buffer_surface_state(state, format, offset, range, stride);
+         gen7_fill_buffer_surface_state(state.map, format, offset, range,
+                                        stride);
       break;
    case 8:
-      gen8_fill_buffer_surface_state(state, format, offset, range, stride);
+      gen8_fill_buffer_surface_state(state.map, format, offset, range, stride);
       break;
    case 9:
-      gen9_fill_buffer_surface_state(state, format, offset, range, stride);
+      gen9_fill_buffer_surface_state(state.map, format, offset, range, stride);
       break;
    default:
       unreachable("unsupported gen\n");
    }
+
+   if (!device->info.has_llc)
+      anv_state_clflush(state);
 }
 
 void anv_DestroySampler(