i965/gen9: Optimize slice and subslice load balancing behavior.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_program_binary.c
index 1fe3ffd5bf96257401e103144268632725782268..bf875341e21e301533e70f932ab9fcd341b20d44 100644 (file)
@@ -126,6 +126,16 @@ driver_blob_is_ready(void *blob, uint32_t size, bool with_gen_program)
    }
 }
 
+static void   /* Appends one NIR_PART record to writer: tag, size, then nir_serialize() output for prog->nir. */
+serialize_nir_part(struct blob *writer, struct gl_program *prog)
+{
+   blob_write_uint32(writer, NIR_PART);
+   intptr_t size_offset = blob_reserve_uint32(writer);   /* size slot, overwritten below */
+   size_t nir_start = writer->size;
+   nir_serialize(writer, prog->nir);
+   blob_overwrite_uint32(writer, size_offset, writer->size - nir_start);   /* growth of writer->size across nir_serialize() */
+}
+
 void
 brw_program_serialize_nir(struct gl_context *ctx, struct gl_program *prog)
 {
@@ -138,11 +148,7 @@ brw_program_serialize_nir(struct gl_context *ctx, struct gl_program *prog)
 
    struct blob writer;
    blob_init(&writer);
-   blob_write_uint32(&writer, NIR_PART);
-   intptr_t size_offset = blob_reserve_uint32(&writer);
-   size_t nir_start = writer.size;
-   nir_serialize(&writer, prog->nir);
-   blob_overwrite_uint32(&writer, size_offset, writer.size - nir_start);
+   serialize_nir_part(&writer, prog);
    blob_write_uint32(&writer, END_PART);
    prog->driver_cache_blob = ralloc_size(NULL, writer.size);
    memcpy(prog->driver_cache_blob, writer.data, writer.size);
@@ -158,7 +164,7 @@ deserialize_gen_program(struct blob_reader *reader, struct gl_context *ctx,
 
    union brw_any_prog_key prog_key;
    blob_copy_bytes(reader, &prog_key, brw_prog_key_size(stage));
-   brw_prog_key_set_id(&prog_key, stage, brw_program(prog)->id);
+   prog_key.base.program_string_id = brw_program(prog)->id;
 
    enum brw_cache_id cache_id = brw_stage_cache_id(stage);
 
@@ -200,14 +206,14 @@ brw_program_deserialize_driver_blob(struct gl_context *ctx,
          break;
       switch ((enum driver_cache_blob_part)part_type) {
       case GEN_PART: {
-         uint32_t gen_size = blob_read_uint32(&reader);
+         ASSERTED uint32_t gen_size = blob_read_uint32(&reader);
          assert(!reader.overrun &&
                 (uintptr_t)(reader.end - reader.current) > gen_size);
          deserialize_gen_program(&reader, ctx, prog, stage);
          break;
       }
       case NIR_PART: {
-         uint32_t nir_size = blob_read_uint32(&reader);
+         ASSERTED uint32_t nir_size = blob_read_uint32(&reader);
          assert(!reader.overrun &&
                 (uintptr_t)(reader.end - reader.current) > nir_size);
          const struct nir_shader_compiler_options *options =
@@ -237,12 +243,66 @@ brw_deserialize_program_binary(struct gl_context *ctx,
    brw_program_deserialize_driver_blob(ctx, prog, prog->info.stage);
 }
 
+static void   /* Appends a GEN_PART record for prog to writer, but only when brw_search_cache() finds its program. */
+serialize_gen_part(struct blob *writer, struct gl_context *ctx,
+                   struct gl_shader_program *sh_prog,
+                   struct gl_program *prog)
+{
+   struct brw_context *brw = brw_context(ctx);
+
+   union brw_any_prog_key key;
+   brw_populate_default_key(brw->screen->compiler, &key, sh_prog, prog);   /* key used for the cache lookup below */
+
+   const gl_shader_stage stage = prog->info.stage;
+   uint32_t offset = 0;
+   void *prog_data = NULL;
+   if (brw_search_cache(&brw->cache, brw_stage_cache_id(stage), &key,
+                        brw_prog_key_size(stage), &offset, &prog_data,
+                        false)) {
+      const void *program_map = brw->cache.map + offset;   /* offset presumably filled in by brw_search_cache() via &offset */
+      /* TODO: Improve perf for non-LLC. It would be best to save it at
+       * program generation time when the program is in normal memory
+       * accessible with cache to the CPU. Another easier change would be to
+       * use _mesa_streaming_load_memcpy to read from the program mapped
+       * memory.
+       */
+      blob_write_uint32(writer, GEN_PART);
+      intptr_t size_offset = blob_reserve_uint32(writer);   /* size slot, overwritten below */
+      size_t gen_start = writer->size;
+      blob_write_bytes(writer, &key, brw_prog_key_size(stage));
+      brw_write_blob_program_data(writer, stage, program_map, prog_data);
+      blob_overwrite_uint32(writer, size_offset, writer->size - gen_start);   /* recorded size spans the key plus the program data */
+   }   /* on a failed lookup nothing is written to writer */
+}
+
 void
 brw_serialize_program_binary(struct gl_context *ctx,
                              struct gl_shader_program *sh_prog,
                              struct gl_program *prog)
 {
-   brw_program_serialize_nir(ctx, prog);
+   if (driver_blob_is_ready(prog->driver_cache_blob,   /* early out: existing blob already passes the check with with_gen_program = true */
+                            prog->driver_cache_blob_size, true))
+      return;
+
+   if (prog->driver_cache_blob) {
+      if (!prog->nir) {
+         /* If we loaded from the disk shader cache, then the nir might not
+          * have been deserialized yet.
+          */
+         brw_program_deserialize_driver_blob(ctx, prog, prog->info.stage);
+      }
+      ralloc_free(prog->driver_cache_blob);   /* old blob is replaced by the one built below */
+   }
+
+   struct blob writer;
+   blob_init(&writer);
+   serialize_nir_part(&writer, prog);   /* blob order: NIR_PART, then GEN_PART (written only on a cache hit), then END_PART */
+   serialize_gen_part(&writer, ctx, sh_prog, prog);
+   blob_write_uint32(&writer, END_PART);
+   prog->driver_cache_blob = ralloc_size(NULL, writer.size);   /* new buffer that receives a copy of writer.data */
+   memcpy(prog->driver_cache_blob, writer.data, writer.size);
+   prog->driver_cache_blob_size = writer.size;
+   blob_finish(&writer);   /* writer is not used after its bytes were copied above */
 }
 
 void