From 998511a6100212931d039e3a46403d2d878c8e5a Mon Sep 17 00:00:00 2001
From: Thomas Koenig <tkoenig@gcc.gnu.org>
Date: Tue, 18 Sep 2018 20:18:09 +0000
Subject: [PATCH] re PR fortran/29550 (Optimize -fexternal-blas calls for
 conjg())

2018-09-18  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/29550
	* gfortran.h (gfc_expr): Add external_blas flag.
	* frontend-passes.c (matrix_case): Add case A2TB2T.
	(optimize_namespace): Handle flag_external_blas by
	calling call_external_blas.
	(get_array_inq_function): Add argument okind. If
	it is nonzero, use it as the kind of argument
	to be used.
	(inline_limit_check): Remove m_case argument, add
	limit argument instead.  Remove assert about m_case.
	Set the limit for inlining from the limit argument.
	(matmul_lhs_realloc): Handle case A2TB2T.
	(inline_matmul_assign): Handle inline limit for other cases with
	two rank-two matrices.  Remove no-op calls to inline_limit_check.
	(call_external_blas): New function.
	* trans-intrinsic.c (gfc_conv_intrinsic_funcall): Do not add
	argument to external BLAS if external_blas is already set.

2018-09-18  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/29550
	* gfortran.dg/inline_matmul_13.f90: Adjust count for
	_gfortran_matmul.
	* gfortran.dg/inline_matmul_16.f90: Likewise.
	* gfortran.dg/promotion_2.f90: Add -fblas-matmul-limit=1.  Scan
	for dgemm instead of dgemm_.  Add call to random_number to make
	standard conforming.
	* gfortran.dg/matmul_blas_1.f90: New test.
	* gfortran.dg/matmul_bounds_14.f: New test.
	* gfortran.dg/matmul_bounds_15.f: New test.
	* gfortran.dg/matmul_bounds_16.f: New test.
	* gfortran.dg/blas_gemm_routines.f: New test / additional file for
	preceding tests.

From-SVN: r264412
---
 gcc/fortran/frontend-passes.c                 |  456 +++-
 gcc/fortran/gfortran.h                        |    5 +
 gcc/fortran/trans-intrinsic.c                 |    1 +
 .../gfortran.dg/blas_gemm_routines.f          | 1955 +++++++++++++++++
 .../gfortran.dg/inline_matmul_13.f90          |    2 +-
 .../gfortran.dg/inline_matmul_16.f90          |    2 +-
 gcc/testsuite/gfortran.dg/matmul_blas_1.f     |  240 ++
 gcc/testsuite/gfortran.dg/matmul_bounds_14.f  |   16 +
 gcc/testsuite/gfortran.dg/matmul_bounds_15.f  |   19 +
 gcc/testsuite/gfortran.dg/matmul_bounds_16.f  |   20 +
 gcc/testsuite/gfortran.dg/promotion_2.f90     |    7 +-
 11 files changed, 2701 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/blas_gemm_routines.f
 create mode 100644 gcc/testsuite/gfortran.dg/matmul_blas_1.f
 create mode 100644 gcc/testsuite/gfortran.dg/matmul_bounds_14.f
 create mode 100644 gcc/testsuite/gfortran.dg/matmul_bounds_15.f
 create mode 100644 gcc/testsuite/gfortran.dg/matmul_bounds_16.f

diff --git a/gcc/fortran/frontend-passes.c b/gcc/fortran/frontend-passes.c
index 80a65fc9a21..2a65b52fad7 100644
--- a/gcc/fortran/frontend-passes.c
+++ b/gcc/fortran/frontend-passes.c
@@ -53,6 +53,7 @@ static gfc_code * create_do_loop (gfc_expr *, gfc_expr *, gfc_expr *,
 				  char *vname=NULL);
 static gfc_expr* check_conjg_transpose_variable (gfc_expr *, bool *,
 						 bool *);
+static int call_external_blas (gfc_code **, int *, void *);
 static bool has_dimen_vector_ref (gfc_expr *);
 static int matmul_temp_args (gfc_code **, int *,void *data);
 static int index_interchange (gfc_code **, int*, void *);
@@ -131,7 +132,7 @@ static int var_num = 1;
 
 /* What sort of matrix we are dealing with when inlining MATMUL.  */
 
-enum matrix_case { none=0, A2B2, A2B1, A1B2, A2B2T, A2TB2 };
+enum matrix_case { none=0, A2B2, A2B1, A1B2, A2B2T, A2TB2, A2TB2T };
 
 /* Keep track of the number of expressions we have inserted so far
    using create_var.  */
@@ -1428,7 +1429,7 @@ optimize_namespace (gfc_namespace *ns)
       gfc_code_walker (&ns->code, convert_elseif, dummy_expr_callback, NULL);
       gfc_code_walker (&ns->code, cfe_code, cfe_expr_0, NULL);
       gfc_code_walker (&ns->code, optimize_code, optimize_expr, NULL);
-      if (flag_inline_matmul_limit != 0)
+      if (flag_inline_matmul_limit != 0 || flag_external_blas)
 	{
 	  bool found;
 	  do
@@ -1441,9 +1442,15 @@ optimize_namespace (gfc_namespace *ns)
 
 	  gfc_code_walker (&ns->code, matmul_temp_args, dummy_expr_callback,
 			   NULL);
-	  gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback,
-			   NULL);
 	}
+
+      if (flag_external_blas)
+	gfc_code_walker (&ns->code, call_external_blas, dummy_expr_callback,
+			 NULL);
+
+      if (flag_inline_matmul_limit != 0)
+	gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback,
+			 NULL);
     }
 
   if (flag_frontend_loop_interchange)
@@ -2938,7 +2945,7 @@ matmul_temp_args (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
    dim is zero-based.  */
 
 static gfc_expr *
-get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim)
+get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim, int okind = 0)
 {
   gfc_expr *fcn;
   gfc_expr *dim_arg, *kind;
@@ -2964,8 +2971,12 @@ get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim)
     }
 
   dim_arg =  gfc_get_int_expr (gfc_default_integer_kind, &e->where, dim);
-  kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
-			   gfc_index_integer_kind);
+  if (okind != 0)
+    kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
+			     okind);
+  else
+    kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
+			     gfc_index_integer_kind);
 
   ec = gfc_copy_expr (e);
 
@@ -3026,7 +3037,7 @@ get_operand (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2)
    removed by DCE. Only called for rank-two matrices A and B.  */
 
 static gfc_code *
-inline_limit_check (gfc_expr *a, gfc_expr *b, enum matrix_case m_case)
+inline_limit_check (gfc_expr *a, gfc_expr *b, int limit)
 {
   gfc_expr *inline_limit;
   gfc_code *if_1, *if_2, *else_2;
@@ -3034,14 +3045,11 @@ inline_limit_check (gfc_expr *a, gfc_expr *b, enum matrix_case m_case)
   gfc_typespec ts;
   gfc_expr *cond;
 
-  gcc_assert (m_case == A2B2 || m_case == A2B2T || m_case == A2TB2);
-
   /* Calculation is done in real to avoid integer overflow.  */
 
   inline_limit = gfc_get_constant_expr (BT_REAL, gfc_default_real_kind,
 					&a->where);
-  mpfr_set_si (inline_limit->value.real, flag_inline_matmul_limit,
-	       GFC_RND_MODE);
+  mpfr_set_si (inline_limit->value.real, limit, GFC_RND_MODE);
   mpfr_pow_ui (inline_limit->value.real, inline_limit->value.real, 3,
 	       GFC_RND_MODE);
 
@@ -3235,6 +3243,22 @@ matmul_lhs_realloc (gfc_expr *c, gfc_expr *a, gfc_expr *b,
 				 get_array_inq_function (GFC_ISYM_SIZE, b, 2));
       break;
 
+    case A2TB2T:
+      /* This can only happen for BLAS, we do not handle that case in
+	 inline mamtul.  */
+      ar->start[0] = get_array_inq_function (GFC_ISYM_SIZE, a, 2);
+      ar->start[1] = get_array_inq_function (GFC_ISYM_SIZE, b, 1);
+
+      ne1 = build_logical_expr (INTRINSIC_NE,
+				get_array_inq_function (GFC_ISYM_SIZE, c, 1),
+				get_array_inq_function (GFC_ISYM_SIZE, a, 2));
+      ne2 = build_logical_expr (INTRINSIC_NE,
+				get_array_inq_function (GFC_ISYM_SIZE, c, 2),
+				get_array_inq_function (GFC_ISYM_SIZE, b, 1));
+
+      cond = build_logical_expr (INTRINSIC_OR, ne1, ne2);
+      break;
+
     default:
       gcc_unreachable();
 
@@ -3946,9 +3970,11 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
   /* Take care of the inline flag.  If the limit check evaluates to a
      constant, dead code elimination will eliminate the unneeded branch.  */
 
-  if (m_case == A2B2 && flag_inline_matmul_limit > 0)
+  if (flag_inline_matmul_limit > 0 && matrix_a->rank == 2
+      && matrix_b->rank == 2)
     {
-      if_limit = inline_limit_check (matrix_a, matrix_b, m_case);
+      if_limit = inline_limit_check (matrix_a, matrix_b,
+				     flag_inline_matmul_limit);
 
       /* Insert the original statement into the else branch.  */
       if_limit->block->block->next = co;
@@ -4118,7 +4144,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
   switch (m_case)
     {
     case A2B2:
-      inline_limit_check (matrix_a, matrix_b, m_case);
 
       u1 = get_size_m1 (matrix_b, 2);
       u2 = get_size_m1 (matrix_a, 2);
@@ -4151,7 +4176,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
       break;
 
     case A2B2T:
-      inline_limit_check (matrix_a, matrix_b, m_case);
 
       u1 = get_size_m1 (matrix_b, 1);
       u2 = get_size_m1 (matrix_a, 2);
@@ -4184,7 +4208,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
       break;
 
     case A2TB2:
-      inline_limit_check (matrix_a, matrix_b, m_case);
 
       u1 = get_size_m1 (matrix_a, 2);
       u2 = get_size_m1 (matrix_b, 2);
@@ -4311,6 +4334,405 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
   return 0;
 }
 
+/* Change matmul function calls in the form of
+
+   c = matmul(a,b)
+
+   to the corresponding call to a BLAS routine, if applicable.  */
+
+static int
+call_external_blas (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
+		    void *data ATTRIBUTE_UNUSED)
+{
+  gfc_code *co, *co_next;
+  gfc_expr *expr1, *expr2;
+  gfc_expr *matrix_a, *matrix_b;
+  gfc_code *if_limit = NULL;
+  gfc_actual_arglist *a, *b;
+  bool conjg_a, conjg_b, transpose_a, transpose_b;
+  gfc_code *call;
+  const char *blas_name;
+  const char *transa, *transb;
+  gfc_expr *c1, *c2, *b1;
+  gfc_actual_arglist *actual, *next;
+  bt type;
+  int kind;
+  enum matrix_case m_case;
+  bool realloc_c;
+  gfc_code **next_code_point;
+
+  /* Many of the tests for inline matmul also apply here.  */
+
+  co = *c;
+
+  if (co->op != EXEC_ASSIGN)
+    return 0;
+
+  if (in_where || in_assoc_list)
+    return 0;
+
+  /* The BLOCKS generated for the temporary variables and FORALL don't
+     mix.  */
+  if (forall_level > 0)
+    return 0;
+
+  /* For now don't do anything in OpenMP workshare, it confuses
+     its translation, which expects only the allowed statements in there. */
+
+  if (in_omp_workshare)
+    return 0;
+
+  expr1 = co->expr1;
+  expr2 = co->expr2;
+  if (expr2->expr_type != EXPR_FUNCTION
+      || expr2->value.function.isym == NULL
+      || expr2->value.function.isym->id != GFC_ISYM_MATMUL)
+    return 0;
+
+  type = expr2->ts.type;
+  kind = expr2->ts.kind;
+
+  /* Guard against recursion. */
+
+  if (expr2->external_blas)
+    return 0;
+
+  if (type != expr1->ts.type || kind != expr1->ts.kind)
+    return 0;
+
+  if (type == BT_REAL)
+    {
+      if (kind == 4)
+	blas_name = "sgemm";
+      else if (kind == 8)
+	blas_name = "dgemm";
+      else
+	return 0;
+    }
+  else if (type == BT_COMPLEX)
+    {
+      if (kind == 4)
+	blas_name = "cgemm";
+      else if (kind == 8)
+	blas_name = "zgemm";
+      else
+	return 0;
+    }
+  else
+    return 0;
+
+  a = expr2->value.function.actual;
+  if (a->expr->rank != 2)
+    return 0;
+
+  b = a->next;
+  if (b->expr->rank != 2)
+    return 0;
+
+  matrix_a = check_conjg_transpose_variable (a->expr, &conjg_a, &transpose_a);
+  if (matrix_a == NULL)
+    return 0;
+
+  if (transpose_a)
+    {
+      if (conjg_a)
+	transa = "C";
+      else
+	transa = "T";
+    }
+  else
+    transa = "N";
+
+  matrix_b = check_conjg_transpose_variable (b->expr, &conjg_b, &transpose_b);
+  if (matrix_b == NULL)
+    return 0;
+
+  if (transpose_b)
+    {
+      if (conjg_b)
+	transb = "C";
+      else
+	transb = "T";
+    }
+  else
+    transb = "N";
+
+  if (transpose_a)
+    {
+      if (transpose_b)
+	m_case = A2TB2T;
+      else
+	m_case = A2TB2;
+    }
+  else
+    {
+      if (transpose_b)
+	m_case = A2B2T;
+      else
+	m_case = A2B2;
+    }
+
+  current_code = c;
+  inserted_block = NULL;
+  changed_statement = NULL;
+
+  expr2->external_blas = 1;
+
+  /* We do not handle data dependencies yet.  */
+  if (gfc_check_dependency (expr1, matrix_a, true)
+      || gfc_check_dependency (expr1, matrix_b, true))
+    return 0;
+
+  /* Generate the if statement and hang it into the tree.  */
+  if_limit = inline_limit_check (matrix_a, matrix_b, flag_blas_matmul_limit);
+  co_next = co->next;
+  (*current_code) = if_limit;
+  co->next = NULL;
+  if_limit->block->next = co;
+
+  call = XCNEW (gfc_code);
+  call->loc = co->loc;
+
+  /* Bounds checking - a bit simpler than for inlining since we only
+     have to take care of two-dimensional arrays here.  */
+
+  realloc_c = flag_realloc_lhs && gfc_is_reallocatable_lhs (expr1);
+  next_code_point = &(if_limit->block->block->next);
+
+  if (gfc_option.rtcheck & GFC_RTCHECK_BOUNDS)
+    {
+      gfc_code *test;
+      //      gfc_expr *a2, *b1, *c1, *c2, *a1, *b2;
+      gfc_expr *c1, *a1, *c2, *b2, *a2;
+      switch (m_case)
+	{
+	case A2B2:
+	  b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+	  a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
+	  test = runtime_error_ne (b1, a2, B_ERROR(1));
+	  *next_code_point = test;
+	  next_code_point = &test->next;
+
+	  if (!realloc_c)
+	    {
+	      c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
+	      a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
+	      test = runtime_error_ne (c1, a1, C_ERROR(1));
+	      *next_code_point = test;
+	      next_code_point = &test->next;
+
+	      c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
+	      b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
+	      test = runtime_error_ne (c2, b2, C_ERROR(2));
+	      *next_code_point = test;
+	      next_code_point = &test->next;
+	    }
+	  break;
+
+	case A2B2T:
+
+	  b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
+	  a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
+	  /* matrix_b is transposed, hence dimension 1 for the error message.  */
+	  test = runtime_error_ne (b2, a2, B_ERROR(1));
+	  *next_code_point = test;
+	  next_code_point = &test->next;
+
+	  if (!realloc_c)
+	    {
+	      c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
+	      a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
+	      test = runtime_error_ne (c1, a1, C_ERROR(1));
+	      *next_code_point = test;
+	      next_code_point = &test->next;
+
+	      c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
+	      b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+	      test = runtime_error_ne (c2, b1, C_ERROR(2));
+	      *next_code_point = test;
+	      next_code_point = &test->next;
+	    }
+	  break;
+
+	case A2TB2:
+
+	  b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+	  a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
+	  test = runtime_error_ne (b1, a1, B_ERROR(1));
+	  *next_code_point = test;
+	  next_code_point = &test->next;
+
+	  if (!realloc_c)
+	    {
+	      c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
+	      a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
+	      test = runtime_error_ne (c1, a2, C_ERROR(1));
+	      *next_code_point = test;
+	      next_code_point = &test->next;
+
+	      c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
+	      b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
+	      test = runtime_error_ne (c2, b2, C_ERROR(2));
+	      *next_code_point = test;
+	      next_code_point = &test->next;
+	    }
+	  break;
+
+	case A2TB2T:
+	  b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
+	  a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
+	  test = runtime_error_ne (b2, a1, B_ERROR(1));
+	  *next_code_point = test;
+	  next_code_point = &test->next;
+
+	  if (!realloc_c)
+	    {
+	      c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
+	      a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
+	      test = runtime_error_ne (c1, a2, C_ERROR(1));
+	      *next_code_point = test;
+	      next_code_point = &test->next;
+
+	      c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
+	      b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+	      test = runtime_error_ne (c2, b1, C_ERROR(2));
+	      *next_code_point = test;
+	      next_code_point = &test->next;
+	    }
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }    
+
+  /* Handle the reallocation, if needed.  */
+
+  if (realloc_c)
+    {
+      gfc_code *lhs_alloc;
+
+      lhs_alloc = matmul_lhs_realloc (expr1, matrix_a, matrix_b, m_case);
+      *next_code_point = lhs_alloc;
+      next_code_point = &lhs_alloc->next;
+    }
+
+  *next_code_point = call;
+  if_limit->next = co_next;
+
+  /* Set up the BLAS call.  */
+
+  call->op = EXEC_CALL;
+
+  gfc_get_sym_tree (blas_name, current_ns, &(call->symtree), true);
+  call->symtree->n.sym->attr.subroutine = 1;
+  call->symtree->n.sym->attr.procedure = 1;
+  call->symtree->n.sym->attr.flavor = FL_PROCEDURE;
+  call->resolved_sym = call->symtree->n.sym;
+
+  /* Argument TRANSA.  */
+  next = gfc_get_actual_arglist ();
+  next->expr = gfc_get_character_expr (gfc_default_character_kind, &co->loc,
+				       transa, 1);
+
+  call->ext.actual = next;
+
+  /* Argument TRANSB.  */
+  actual = next;
+  next = gfc_get_actual_arglist ();
+  next->expr = gfc_get_character_expr (gfc_default_character_kind, &co->loc,
+				       transb, 1);
+  actual->next = next;
+
+  c1 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (a->expr), 1,
+			       gfc_integer_4_kind);
+  c2 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (b->expr), 2,
+			       gfc_integer_4_kind);
+
+  b1 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (b->expr), 1,
+			       gfc_integer_4_kind);
+
+  /* Argument M. */
+  actual = next;
+  next = gfc_get_actual_arglist ();
+  next->expr = c1;
+  actual->next = next;
+
+  /* Argument N. */
+  actual = next;
+  next = gfc_get_actual_arglist ();
+  next->expr = c2;
+  actual->next = next;
+
+  /* Argument K.  */
+  actual = next;
+  next = gfc_get_actual_arglist ();
+  next->expr = b1;
+  actual->next = next;
+
+  /* Argument ALPHA - set to one.  */
+  actual = next;
+  next = gfc_get_actual_arglist ();
+  next->expr = gfc_get_constant_expr (type, kind, &co->loc);
+  if (type == BT_REAL)
+    mpfr_set_ui (next->expr->value.real, 1, GFC_RND_MODE);
+  else
+    mpc_set_ui (next->expr->value.complex, 1, GFC_MPC_RND_MODE);
+  actual->next = next;
+
+  /* Argument A.  */
+  actual = next;
+  next = gfc_get_actual_arglist ();
+  next->expr = gfc_copy_expr (matrix_a);
+  actual->next = next;
+
+  /* Argument LDA.  */
+  actual = next;
+  next = gfc_get_actual_arglist ();
+  next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (matrix_a),
+				       1, gfc_integer_4_kind);
+  actual->next = next;
+
+  /* Argument B.  */
+  actual = next;
+  next = gfc_get_actual_arglist ();
+  next->expr = gfc_copy_expr (matrix_b);
+  actual->next = next;
+
+  /* Argument LDB.  */
+  actual = next;
+  next = gfc_get_actual_arglist ();
+  next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (matrix_b),
+				       1, gfc_integer_4_kind);
+  actual->next = next;
+
+  /* Argument BETA - set to zero.  */
+  actual = next;
+  next = gfc_get_actual_arglist ();
+  next->expr = gfc_get_constant_expr (type, kind, &co->loc);
+  if (type == BT_REAL)
+    mpfr_set_ui (next->expr->value.real, 0, GFC_RND_MODE);
+  else
+    mpc_set_ui (next->expr->value.complex, 0, GFC_MPC_RND_MODE);
+  actual->next = next;
+
+  /* Argument C.  */
+
+  actual = next;
+  next = gfc_get_actual_arglist ();
+  next->expr = gfc_copy_expr (expr1);
+  actual->next = next;
+
+  /* Argument LDC.  */
+  actual = next;
+  next = gfc_get_actual_arglist ();
+  next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (expr1),
+				       1, gfc_integer_4_kind);
+  actual->next = next;
+
+  return 0;
+}
+
 
 /* Code for index interchange for loops which are grouped together in DO
    CONCURRENT or FORALL statements.  This is currently only applied if the
diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index 04b0024a992..3359974d108 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -2143,6 +2143,11 @@ typedef struct gfc_expr
 
   unsigned int no_bounds_check : 1;
 
+  /* Set this if a matmul expression has already been evaluated for conversion
+     to a BLAS call.  */
+
+  unsigned int external_blas : 1;
+
   /* If an expression comes from a Hollerith constant or compile-time
      evaluation of a transfer statement, it may have a prescribed target-
      memory representation, and these cannot always be backformed from
diff --git a/gcc/fortran/trans-intrinsic.c b/gcc/fortran/trans-intrinsic.c
index b2cea93742a..0c83f474797 100644
--- a/gcc/fortran/trans-intrinsic.c
+++ b/gcc/fortran/trans-intrinsic.c
@@ -4055,6 +4055,7 @@ gfc_conv_intrinsic_funcall (gfc_se * se, gfc_expr * expr)
      to be able to call the BLAS ?gemm functions if required and possible.  */
   append_args = NULL;
   if (expr->value.function.isym->id == GFC_ISYM_MATMUL
+      && !expr->external_blas
       && sym->ts.type != BT_LOGICAL)
     {
       tree cint = gfc_get_int_type (gfc_c_int_kind);
diff --git a/gcc/testsuite/gfortran.dg/blas_gemm_routines.f b/gcc/testsuite/gfortran.dg/blas_gemm_routines.f
new file mode 100644
index 00000000000..f97a33b5fbc
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/blas_gemm_routines.f
@@ -0,0 +1,1955 @@
+! { dg-do compile }
+! { dg-options "-std=legacy" }
+*> \brief \b CGEMM
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
+*
+*       .. Scalar Arguments ..
+*       COMPLEX ALPHA,BETA
+*       INTEGER K,LDA,LDB,LDC,M,N
+*       CHARACTER TRANSA,TRANSB
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX A(LDA,*),B(LDB,*),C(LDC,*)
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGEMM  performs one of the matrix-matrix operations
+*>
+*>    C := alpha*op( A )*op( B ) + beta*C,
+*>
+*> where  op( X ) is one of
+*>
+*>    op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H,
+*>
+*> alpha and beta are scalars, and A, B and C are matrices, with op( A )
+*> an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] TRANSA
+*> \verbatim
+*>          TRANSA is CHARACTER*1
+*>           On entry, TRANSA specifies the form of op( A ) to be used in
+*>           the matrix multiplication as follows:
+*>
+*>              TRANSA = 'N' or 'n',  op( A ) = A.
+*>
+*>              TRANSA = 'T' or 't',  op( A ) = A**T.
+*>
+*>              TRANSA = 'C' or 'c',  op( A ) = A**H.
+*> \endverbatim
+*>
+*> \param[in] TRANSB
+*> \verbatim
+*>          TRANSB is CHARACTER*1
+*>           On entry, TRANSB specifies the form of op( B ) to be used in
+*>           the matrix multiplication as follows:
+*>
+*>              TRANSB = 'N' or 'n',  op( B ) = B.
+*>
+*>              TRANSB = 'T' or 't',  op( B ) = B**T.
+*>
+*>              TRANSB = 'C' or 'c',  op( B ) = B**H.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>           On entry,  M  specifies  the number  of rows  of the  matrix
+*>           op( A )  and of the  matrix  C.  M  must  be at least  zero.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>           On entry,  N  specifies the number  of columns of the matrix
+*>           op( B ) and the number of columns of the matrix C. N must be
+*>           at least zero.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>           On entry,  K  specifies  the number of columns of the matrix
+*>           op( A ) and the number of rows of the matrix op( B ). K must
+*>           be at least  zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*>          ALPHA is COMPLEX
+*>           On entry, ALPHA specifies the scalar alpha.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is COMPLEX array, dimension ( LDA, ka ), where ka is
+*>           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
+*>           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
+*>           part of the array  A  must contain the matrix  A,  otherwise
+*>           the leading  k by m  part of the array  A  must contain  the
+*>           matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>           On entry, LDA specifies the first dimension of A as declared
+*>           in the calling (sub) program. When  TRANSA = 'N' or 'n' then
+*>           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*>           least  max( 1, k ).
+*> \endverbatim
+*>
+*> \param[in] B
+*> \verbatim
+*>          B is COMPLEX array, dimension ( LDB, kb ), where kb is
+*>           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
+*>           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
+*>           part of the array  B  must contain the matrix  B,  otherwise
+*>           the leading  n by k  part of the array  B  must contain  the
+*>           matrix B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>           On entry, LDB specifies the first dimension of B as declared
+*>           in the calling (sub) program. When  TRANSB = 'N' or 'n' then
+*>           LDB must be at least  max( 1, k ), otherwise  LDB must be at
+*>           least  max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] BETA
+*> \verbatim
+*>          BETA is COMPLEX
+*>           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*>           supplied as zero then C need not be set on input.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is COMPLEX array, dimension ( LDC, N )
+*>           Before entry, the leading  m by n  part of the array  C must
+*>           contain the matrix  C,  except when  beta  is zero, in which
+*>           case C need not be set on entry.
+*>           On exit, the array  C  is overwritten by the  m by n  matrix
+*>           ( alpha*op( A )*op( B ) + beta*C ).
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>           On entry, LDC specifies the first dimension of C as declared
+*>           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*>           max( 1, m ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date December 2016
+*
+*> \ingroup complex_blas_level3
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Level 3 Blas routine.
+*>
+*>  -- Written on 8-February-1989.
+*>     Jack Dongarra, Argonne National Laboratory.
+*>     Iain Duff, AERE Harwell.
+*>     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>     Sven Hammarling, Numerical Algorithms Group Ltd.
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE CGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
+*
+*  -- Reference BLAS level3 routine (version 3.7.0) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     December 2016
+*
+*     .. Scalar Arguments ..
+      COMPLEX ALPHA,BETA
+      INTEGER K,LDA,LDB,LDC,M,N
+      CHARACTER TRANSA,TRANSB
+*     ..
+*     .. Array Arguments ..
+      COMPLEX A(LDA,*),B(LDB,*),C(LDC,*)
+*     ..
+*
+*  =====================================================================
+*
+*     .. External Functions ..
+      LOGICAL LSAME
+      EXTERNAL LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC CONJG,MAX
+*     ..
+*     .. Local Scalars ..
+      COMPLEX TEMP
+      INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB
+      LOGICAL CONJA,CONJB,NOTA,NOTB
+*     ..
+*     .. Parameters ..
+      COMPLEX ONE
+      PARAMETER (ONE= (1.0E+0,0.0E+0))
+      COMPLEX ZERO
+      PARAMETER (ZERO= (0.0E+0,0.0E+0))
+*     ..
+*
+*     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+*     conjugated or transposed, set  CONJA and CONJB  as true if  A  and
+*     B  respectively are to be  transposed but  not conjugated  and set
+*     NROWA, NCOLA and  NROWB  as the number of rows and  columns  of  A
+*     and the number of rows of  B  respectively.
+*
+      NOTA = LSAME(TRANSA,'N')
+      NOTB = LSAME(TRANSB,'N')
+      CONJA = LSAME(TRANSA,'C')
+      CONJB = LSAME(TRANSB,'C')
+      IF (NOTA) THEN
+          NROWA = M
+          NCOLA = K
+      ELSE
+          NROWA = K
+          NCOLA = M
+      END IF
+      IF (NOTB) THEN
+          NROWB = K
+      ELSE
+          NROWB = N
+      END IF
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND.
+     +    (.NOT.LSAME(TRANSA,'T'))) THEN
+          INFO = 1
+      ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND.
+     +         (.NOT.LSAME(TRANSB,'T'))) THEN
+          INFO = 2
+      ELSE IF (M.LT.0) THEN
+          INFO = 3
+      ELSE IF (N.LT.0) THEN
+          INFO = 4
+      ELSE IF (K.LT.0) THEN
+          INFO = 5
+      ELSE IF (LDA.LT.MAX(1,NROWA)) THEN
+          INFO = 8
+      ELSE IF (LDB.LT.MAX(1,NROWB)) THEN
+          INFO = 10
+      ELSE IF (LDC.LT.MAX(1,M)) THEN
+          INFO = 13
+      END IF
+      IF (INFO.NE.0) THEN
+          CALL XERBLA('CGEMM ',INFO)
+          RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF ((M.EQ.0) .OR. (N.EQ.0) .OR.
+     +    (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF (ALPHA.EQ.ZERO) THEN
+          IF (BETA.EQ.ZERO) THEN
+              DO 20 J = 1,N
+                  DO 10 I = 1,M
+                      C(I,J) = ZERO
+   10             CONTINUE
+   20         CONTINUE
+          ELSE
+              DO 40 J = 1,N
+                  DO 30 I = 1,M
+                      C(I,J) = BETA*C(I,J)
+   30             CONTINUE
+   40         CONTINUE
+          END IF
+          RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF (NOTB) THEN
+          IF (NOTA) THEN
+*
+*           Form  C := alpha*A*B + beta*C.
+*
+              DO 90 J = 1,N
+                  IF (BETA.EQ.ZERO) THEN
+                      DO 50 I = 1,M
+                          C(I,J) = ZERO
+   50                 CONTINUE
+                  ELSE IF (BETA.NE.ONE) THEN
+                      DO 60 I = 1,M
+                          C(I,J) = BETA*C(I,J)
+   60                 CONTINUE
+                  END IF
+                  DO 80 L = 1,K
+                      TEMP = ALPHA*B(L,J)
+                      DO 70 I = 1,M
+                          C(I,J) = C(I,J) + TEMP*A(I,L)
+   70                 CONTINUE
+   80             CONTINUE
+   90         CONTINUE
+          ELSE IF (CONJA) THEN
+*
+*           Form  C := alpha*A**H*B + beta*C.
+*
+              DO 120 J = 1,N
+                  DO 110 I = 1,M
+                      TEMP = ZERO
+                      DO 100 L = 1,K
+                          TEMP = TEMP + CONJG(A(L,I))*B(L,J)
+  100                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  110             CONTINUE
+  120         CONTINUE
+          ELSE
+*
+*           Form  C := alpha*A**T*B + beta*C
+*
+              DO 150 J = 1,N
+                  DO 140 I = 1,M
+                      TEMP = ZERO
+                      DO 130 L = 1,K
+                          TEMP = TEMP + A(L,I)*B(L,J)
+  130                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  140             CONTINUE
+  150         CONTINUE
+          END IF
+      ELSE IF (NOTA) THEN
+          IF (CONJB) THEN
+*
+*           Form  C := alpha*A*B**H + beta*C.
+*
+              DO 200 J = 1,N
+                  IF (BETA.EQ.ZERO) THEN
+                      DO 160 I = 1,M
+                          C(I,J) = ZERO
+  160                 CONTINUE
+                  ELSE IF (BETA.NE.ONE) THEN
+                      DO 170 I = 1,M
+                          C(I,J) = BETA*C(I,J)
+  170                 CONTINUE
+                  END IF
+                  DO 190 L = 1,K
+                      TEMP = ALPHA*CONJG(B(J,L))
+                      DO 180 I = 1,M
+                          C(I,J) = C(I,J) + TEMP*A(I,L)
+  180                 CONTINUE
+  190             CONTINUE
+  200         CONTINUE
+          ELSE
+*
+*           Form  C := alpha*A*B**T + beta*C
+*
+              DO 250 J = 1,N
+                  IF (BETA.EQ.ZERO) THEN
+                      DO 210 I = 1,M
+                          C(I,J) = ZERO
+  210                 CONTINUE
+                  ELSE IF (BETA.NE.ONE) THEN
+                      DO 220 I = 1,M
+                          C(I,J) = BETA*C(I,J)
+  220                 CONTINUE
+                  END IF
+                  DO 240 L = 1,K
+                      TEMP = ALPHA*B(J,L)
+                      DO 230 I = 1,M
+                          C(I,J) = C(I,J) + TEMP*A(I,L)
+  230                 CONTINUE
+  240             CONTINUE
+  250         CONTINUE
+          END IF
+      ELSE IF (CONJA) THEN
+          IF (CONJB) THEN
+*
+*           Form  C := alpha*A**H*B**H + beta*C.
+*
+              DO 280 J = 1,N
+                  DO 270 I = 1,M
+                      TEMP = ZERO
+                      DO 260 L = 1,K
+                          TEMP = TEMP + CONJG(A(L,I))*CONJG(B(J,L))
+  260                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  270             CONTINUE
+  280         CONTINUE
+          ELSE
+*
+*           Form  C := alpha*A**H*B**T + beta*C
+*
+              DO 310 J = 1,N
+                  DO 300 I = 1,M
+                      TEMP = ZERO
+                      DO 290 L = 1,K
+                          TEMP = TEMP + CONJG(A(L,I))*B(J,L)
+  290                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  300             CONTINUE
+  310         CONTINUE
+          END IF
+      ELSE
+          IF (CONJB) THEN
+*
+*           Form  C := alpha*A**T*B**H + beta*C
+*
+              DO 340 J = 1,N
+                  DO 330 I = 1,M
+                      TEMP = ZERO
+                      DO 320 L = 1,K
+                          TEMP = TEMP + A(L,I)*CONJG(B(J,L))
+  320                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  330             CONTINUE
+  340         CONTINUE
+          ELSE
+*
+*           Form  C := alpha*A**T*B**T + beta*C
+*
+              DO 370 J = 1,N
+                  DO 360 I = 1,M
+                      TEMP = ZERO
+                      DO 350 L = 1,K
+                          TEMP = TEMP + A(L,I)*B(J,L)
+  350                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  360             CONTINUE
+  370         CONTINUE
+          END IF
+      END IF
+*
+      RETURN
+*
+*     End of CGEMM .
+*
+      END
+
+*> \brief \b LSAME
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       LOGICAL FUNCTION LSAME(CA,CB)
+*
+*       .. Scalar Arguments ..
+*       CHARACTER CA,CB
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> LSAME returns .TRUE. if CA is the same letter as CB regardless of
+*> case.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] CA
+*> \verbatim
+*>          CA is CHARACTER*1
+*> \endverbatim
+*>
+*> \param[in] CB
+*> \verbatim
+*>          CB is CHARACTER*1
+*>          CA and CB specify the single characters to be compared.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date December 2016
+*
+*> \ingroup aux_blas
+*
+*  =====================================================================
+      LOGICAL FUNCTION LSAME(CA,CB)
+*
+*  -- Reference BLAS level1 routine (version 3.1) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     December 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER CA,CB
+*     ..
+*
+* =====================================================================
+*
+*     .. Intrinsic Functions ..
+      INTRINSIC ICHAR
+*     ..
+*     .. Local Scalars ..
+      INTEGER INTA,INTB,ZCODE
+*     ..
+*
+*     Test if the characters are equal
+*
+      LSAME = CA .EQ. CB
+      IF (LSAME) RETURN
+*
+*     Now test for equivalence if both characters are alphabetic.
+*
+      ZCODE = ICHAR('Z')
+*
+*     Use 'Z' rather than 'A' so that ASCII can be detected on Prime
+*     machines, on which ICHAR returns a value with bit 8 set.
+*     ICHAR('A') on Prime machines returns 193 which is the same as
+*     ICHAR('A') on an EBCDIC machine.
+*
+      INTA = ICHAR(CA)
+      INTB = ICHAR(CB)
+*
+      IF (ZCODE.EQ.90 .OR. ZCODE.EQ.122) THEN
+*
+*        ASCII is assumed - ZCODE is the ASCII code of either lower or
+*        upper case 'Z'.
+*
+          IF (INTA.GE.97 .AND. INTA.LE.122) INTA = INTA - 32
+          IF (INTB.GE.97 .AND. INTB.LE.122) INTB = INTB - 32
+*
+      ELSE IF (ZCODE.EQ.233 .OR. ZCODE.EQ.169) THEN
+*
+*        EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or
+*        upper case 'Z'.
+*
+          IF (INTA.GE.129 .AND. INTA.LE.137 .OR.
+     +        INTA.GE.145 .AND. INTA.LE.153 .OR.
+     +        INTA.GE.162 .AND. INTA.LE.169) INTA = INTA + 64
+          IF (INTB.GE.129 .AND. INTB.LE.137 .OR.
+     +        INTB.GE.145 .AND. INTB.LE.153 .OR.
+     +        INTB.GE.162 .AND. INTB.LE.169) INTB = INTB + 64
+*
+      ELSE IF (ZCODE.EQ.218 .OR. ZCODE.EQ.250) THEN
+*
+*        ASCII is assumed, on Prime machines - ZCODE is the ASCII code
+*        plus 128 of either lower or upper case 'Z'.
+*
+          IF (INTA.GE.225 .AND. INTA.LE.250) INTA = INTA - 32
+          IF (INTB.GE.225 .AND. INTB.LE.250) INTB = INTB - 32
+      END IF
+      LSAME = INTA .EQ. INTB
+*
+*     RETURN
+*
+*     End of LSAME
+*
+      END
+
+*> \brief \b XERBLA
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE XERBLA( SRNAME, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER*(*)      SRNAME
+*       INTEGER            INFO
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> XERBLA  is an error handler for the LAPACK routines.
+*> It is called by an LAPACK routine if an input parameter has an
+*> invalid value.  A message is printed and execution stops.
+*>
+*> Installers may consider modifying the STOP statement in order to
+*> call system-specific exception-handling facilities.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] SRNAME
+*> \verbatim
+*>          SRNAME is CHARACTER*(*)
+*>          The name of the routine which called XERBLA.
+*> \endverbatim
+*>
+*> \param[in] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          The position of the invalid parameter in the parameter list
+*>          of the calling routine.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date December 2016
+*
+*> \ingroup aux_blas
+*
+*  =====================================================================
+      SUBROUTINE XERBLA( SRNAME, INFO )
+*
+*  -- Reference BLAS level1 routine (version 3.7.0) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     December 2016
+*
+*     .. Scalar Arguments ..
+      CHARACTER*(*)      SRNAME
+      INTEGER            INFO
+*     ..
+*
+* =====================================================================
+*
+*     .. Intrinsic Functions ..
+      INTRINSIC          LEN_TRIM
+*     ..
+*     .. Executable Statements ..
+*
+      WRITE( *, FMT = 9999 )SRNAME( 1:LEN_TRIM( SRNAME ) ), INFO
+*
+      STOP
+*
+ 9999 FORMAT( ' ** On entry to ', A, ' parameter number ', I2, ' had ',
+     $      'an illegal value' )
+*
+*     End of XERBLA
+*
+      END
+
+*> \brief \b SGEMM
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
+*
+*       .. Scalar Arguments ..
+*       REAL ALPHA,BETA
+*       INTEGER K,LDA,LDB,LDC,M,N
+*       CHARACTER TRANSA,TRANSB
+*       ..
+*       .. Array Arguments ..
+*       REAL A(LDA,*),B(LDB,*),C(LDC,*)
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SGEMM  performs one of the matrix-matrix operations
+*>
+*>    C := alpha*op( A )*op( B ) + beta*C,
+*>
+*> where  op( X ) is one of
+*>
+*>    op( X ) = X   or   op( X ) = X**T,
+*>
+*> alpha and beta are scalars, and A, B and C are matrices, with op( A )
+*> an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] TRANSA
+*> \verbatim
+*>          TRANSA is CHARACTER*1
+*>           On entry, TRANSA specifies the form of op( A ) to be used in
+*>           the matrix multiplication as follows:
+*>
+*>              TRANSA = 'N' or 'n',  op( A ) = A.
+*>
+*>              TRANSA = 'T' or 't',  op( A ) = A**T.
+*>
+*>              TRANSA = 'C' or 'c',  op( A ) = A**T.
+*> \endverbatim
+*>
+*> \param[in] TRANSB
+*> \verbatim
+*>          TRANSB is CHARACTER*1
+*>           On entry, TRANSB specifies the form of op( B ) to be used in
+*>           the matrix multiplication as follows:
+*>
+*>              TRANSB = 'N' or 'n',  op( B ) = B.
+*>
+*>              TRANSB = 'T' or 't',  op( B ) = B**T.
+*>
+*>              TRANSB = 'C' or 'c',  op( B ) = B**T.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>           On entry,  M  specifies  the number  of rows  of the  matrix
+*>           op( A )  and of the  matrix  C.  M  must  be at least  zero.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>           On entry,  N  specifies the number  of columns of the matrix
+*>           op( B ) and the number of columns of the matrix C. N must be
+*>           at least zero.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>           On entry,  K  specifies  the number of columns of the matrix
+*>           op( A ) and the number of rows of the matrix op( B ). K must
+*>           be at least  zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*>          ALPHA is REAL
+*>           On entry, ALPHA specifies the scalar alpha.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is REAL array, dimension ( LDA, ka ), where ka is
+*>           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
+*>           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
+*>           part of the array  A  must contain the matrix  A,  otherwise
+*>           the leading  k by m  part of the array  A  must contain  the
+*>           matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>           On entry, LDA specifies the first dimension of A as declared
+*>           in the calling (sub) program. When  TRANSA = 'N' or 'n' then
+*>           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*>           least  max( 1, k ).
+*> \endverbatim
+*>
+*> \param[in] B
+*> \verbatim
+*>          B is REAL array, dimension ( LDB, kb ), where kb is
+*>           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
+*>           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
+*>           part of the array  B  must contain the matrix  B,  otherwise
+*>           the leading  n by k  part of the array  B  must contain  the
+*>           matrix B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>           On entry, LDB specifies the first dimension of B as declared
+*>           in the calling (sub) program. When  TRANSB = 'N' or 'n' then
+*>           LDB must be at least  max( 1, k ), otherwise  LDB must be at
+*>           least  max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] BETA
+*> \verbatim
+*>          BETA is REAL
+*>           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*>           supplied as zero then C need not be set on input.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension ( LDC, N )
+*>           Before entry, the leading  m by n  part of the array  C must
+*>           contain the matrix  C,  except when  beta  is zero, in which
+*>           case C need not be set on entry.
+*>           On exit, the array  C  is overwritten by the  m by n  matrix
+*>           ( alpha*op( A )*op( B ) + beta*C ).
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>           On entry, LDC specifies the first dimension of C as declared
+*>           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*>           max( 1, m ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date December 2016
+*
+*> \ingroup single_blas_level3
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Level 3 Blas routine.
+*>
+*>  -- Written on 8-February-1989.
+*>     Jack Dongarra, Argonne National Laboratory.
+*>     Iain Duff, AERE Harwell.
+*>     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>     Sven Hammarling, Numerical Algorithms Group Ltd.
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE SGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
+*
+*  -- Reference BLAS level3 routine (version 3.7.0) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     December 2016
+*
+*     .. Scalar Arguments ..
+      REAL ALPHA,BETA
+      INTEGER K,LDA,LDB,LDC,M,N
+      CHARACTER TRANSA,TRANSB
+*     ..
+*     .. Array Arguments ..
+      REAL A(LDA,*),B(LDB,*),C(LDC,*)
+*     ..
+*
+*  =====================================================================
+*
+*     .. External Functions ..
+      LOGICAL LSAME
+      EXTERNAL LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC MAX
+*     ..
+*     .. Local Scalars ..
+      REAL TEMP
+      INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB
+      LOGICAL NOTA,NOTB
+*     ..
+*     .. Parameters ..
+      REAL ONE,ZERO
+      PARAMETER (ONE=1.0E+0,ZERO=0.0E+0)
+*     ..
+*
+*     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+*     transposed and set  NROWA, NCOLA and  NROWB  as the number of rows
+*     and  columns of  A  and the  number of  rows  of  B  respectively.
+*
+      NOTA = LSAME(TRANSA,'N')
+      NOTB = LSAME(TRANSB,'N')
+      IF (NOTA) THEN
+          NROWA = M
+          NCOLA = K
+      ELSE
+          NROWA = K
+          NCOLA = M
+      END IF
+      IF (NOTB) THEN
+          NROWB = K
+      ELSE
+          NROWB = N
+      END IF
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANSA,'C')) .AND.
+     +    (.NOT.LSAME(TRANSA,'T'))) THEN
+          INFO = 1
+      ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANSB,'C')) .AND.
+     +         (.NOT.LSAME(TRANSB,'T'))) THEN
+          INFO = 2
+      ELSE IF (M.LT.0) THEN
+          INFO = 3
+      ELSE IF (N.LT.0) THEN
+          INFO = 4
+      ELSE IF (K.LT.0) THEN
+          INFO = 5
+      ELSE IF (LDA.LT.MAX(1,NROWA)) THEN
+          INFO = 8
+      ELSE IF (LDB.LT.MAX(1,NROWB)) THEN
+          INFO = 10
+      ELSE IF (LDC.LT.MAX(1,M)) THEN
+          INFO = 13
+      END IF
+      IF (INFO.NE.0) THEN
+          CALL XERBLA('SGEMM ',INFO)
+          RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF ((M.EQ.0) .OR. (N.EQ.0) .OR.
+     +    (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN
+*
+*     And if  alpha.eq.zero.
+*
+      IF (ALPHA.EQ.ZERO) THEN
+          IF (BETA.EQ.ZERO) THEN
+              DO 20 J = 1,N
+                  DO 10 I = 1,M
+                      C(I,J) = ZERO
+   10             CONTINUE
+   20         CONTINUE
+          ELSE
+              DO 40 J = 1,N
+                  DO 30 I = 1,M
+                      C(I,J) = BETA*C(I,J)
+   30             CONTINUE
+   40         CONTINUE
+          END IF
+          RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF (NOTB) THEN
+          IF (NOTA) THEN
+*
+*           Form  C := alpha*A*B + beta*C.
+*
+              DO 90 J = 1,N
+                  IF (BETA.EQ.ZERO) THEN
+                      DO 50 I = 1,M
+                          C(I,J) = ZERO
+   50                 CONTINUE
+                  ELSE IF (BETA.NE.ONE) THEN
+                      DO 60 I = 1,M
+                          C(I,J) = BETA*C(I,J)
+   60                 CONTINUE
+                  END IF
+                  DO 80 L = 1,K
+                      TEMP = ALPHA*B(L,J)
+                      DO 70 I = 1,M
+                          C(I,J) = C(I,J) + TEMP*A(I,L)
+   70                 CONTINUE
+   80             CONTINUE
+   90         CONTINUE
+          ELSE
+*
+*           Form  C := alpha*A**T*B + beta*C
+*
+              DO 120 J = 1,N
+                  DO 110 I = 1,M
+                      TEMP = ZERO
+                      DO 100 L = 1,K
+                          TEMP = TEMP + A(L,I)*B(L,J)
+  100                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  110             CONTINUE
+  120         CONTINUE
+          END IF
+      ELSE
+          IF (NOTA) THEN
+*
+*           Form  C := alpha*A*B**T + beta*C
+*
+              DO 170 J = 1,N
+                  IF (BETA.EQ.ZERO) THEN
+                      DO 130 I = 1,M
+                          C(I,J) = ZERO
+  130                 CONTINUE
+                  ELSE IF (BETA.NE.ONE) THEN
+                      DO 140 I = 1,M
+                          C(I,J) = BETA*C(I,J)
+  140                 CONTINUE
+                  END IF
+                  DO 160 L = 1,K
+                      TEMP = ALPHA*B(J,L)
+                      DO 150 I = 1,M
+                          C(I,J) = C(I,J) + TEMP*A(I,L)
+  150                 CONTINUE
+  160             CONTINUE
+  170         CONTINUE
+          ELSE
+*
+*           Form  C := alpha*A**T*B**T + beta*C
+*
+              DO 200 J = 1,N
+                  DO 190 I = 1,M
+                      TEMP = ZERO
+                      DO 180 L = 1,K
+                          TEMP = TEMP + A(L,I)*B(J,L)
+  180                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  190             CONTINUE
+  200         CONTINUE
+          END IF
+      END IF
+*
+      RETURN
+*
+*     End of SGEMM .
+*
+      END
+
+*> \brief \b DGEMM
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
+*
+*       .. Scalar Arguments ..
+*       DOUBLE PRECISION ALPHA,BETA
+*       INTEGER K,LDA,LDB,LDC,M,N
+*       CHARACTER TRANSA,TRANSB
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION A(LDA,*),B(LDB,*),C(LDC,*)
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGEMM  performs one of the matrix-matrix operations
+*>
+*>    C := alpha*op( A )*op( B ) + beta*C,
+*>
+*> where  op( X ) is one of
+*>
+*>    op( X ) = X   or   op( X ) = X**T,
+*>
+*> alpha and beta are scalars, and A, B and C are matrices, with op( A )
+*> an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] TRANSA
+*> \verbatim
+*>          TRANSA is CHARACTER*1
+*>           On entry, TRANSA specifies the form of op( A ) to be used in
+*>           the matrix multiplication as follows:
+*>
+*>              TRANSA = 'N' or 'n',  op( A ) = A.
+*>
+*>              TRANSA = 'T' or 't',  op( A ) = A**T.
+*>
+*>              TRANSA = 'C' or 'c',  op( A ) = A**T.
+*> \endverbatim
+*>
+*> \param[in] TRANSB
+*> \verbatim
+*>          TRANSB is CHARACTER*1
+*>           On entry, TRANSB specifies the form of op( B ) to be used in
+*>           the matrix multiplication as follows:
+*>
+*>              TRANSB = 'N' or 'n',  op( B ) = B.
+*>
+*>              TRANSB = 'T' or 't',  op( B ) = B**T.
+*>
+*>              TRANSB = 'C' or 'c',  op( B ) = B**T.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>           On entry,  M  specifies  the number  of rows  of the  matrix
+*>           op( A )  and of the  matrix  C.  M  must  be at least  zero.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>           On entry,  N  specifies the number  of columns of the matrix
+*>           op( B ) and the number of columns of the matrix C. N must be
+*>           at least zero.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>           On entry,  K  specifies  the number of columns of the matrix
+*>           op( A ) and the number of rows of the matrix op( B ). K must
+*>           be at least  zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*>          ALPHA is DOUBLE PRECISION.
+*>           On entry, ALPHA specifies the scalar alpha.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension ( LDA, ka ), where ka is
+*>           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
+*>           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
+*>           part of the array  A  must contain the matrix  A,  otherwise
+*>           the leading  k by m  part of the array  A  must contain  the
+*>           matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>           On entry, LDA specifies the first dimension of A as declared
+*>           in the calling (sub) program. When  TRANSA = 'N' or 'n' then
+*>           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*>           least  max( 1, k ).
+*> \endverbatim
+*>
+*> \param[in] B
+*> \verbatim
+*>          B is DOUBLE PRECISION array, dimension ( LDB, kb ), where kb is
+*>           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
+*>           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
+*>           part of the array  B  must contain the matrix  B,  otherwise
+*>           the leading  n by k  part of the array  B  must contain  the
+*>           matrix B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>           On entry, LDB specifies the first dimension of B as declared
+*>           in the calling (sub) program. When  TRANSB = 'N' or 'n' then
+*>           LDB must be at least  max( 1, k ), otherwise  LDB must be at
+*>           least  max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] BETA
+*> \verbatim
+*>          BETA is DOUBLE PRECISION.
+*>           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*>           supplied as zero then C need not be set on input.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension ( LDC, N )
+*>           Before entry, the leading  m by n  part of the array  C must
+*>           contain the matrix  C,  except when  beta  is zero, in which
+*>           case C need not be set on entry.
+*>           On exit, the array  C  is overwritten by the  m by n  matrix
+*>           ( alpha*op( A )*op( B ) + beta*C ).
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>           On entry, LDC specifies the first dimension of C as declared
+*>           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*>           max( 1, m ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date December 2016
+*
+*> \ingroup double_blas_level3
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Level 3 Blas routine.
+*>
+*>  -- Written on 8-February-1989.
+*>     Jack Dongarra, Argonne National Laboratory.
+*>     Iain Duff, AERE Harwell.
+*>     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>     Sven Hammarling, Numerical Algorithms Group Ltd.
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE DGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
+*
+*  -- Reference BLAS level3 routine (version 3.7.0) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     December 2016
+*
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION ALPHA,BETA
+      INTEGER K,LDA,LDB,LDC,M,N
+      CHARACTER TRANSA,TRANSB
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION A(LDA,*),B(LDB,*),C(LDC,*)
+*     ..
+*
+*  =====================================================================
+*
+*     .. External Functions ..
+      LOGICAL LSAME
+      EXTERNAL LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC MAX
+*     ..
+*     .. Local Scalars ..
+      DOUBLE PRECISION TEMP
+      INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB
+      LOGICAL NOTA,NOTB
+*     ..
+*     .. Parameters ..
+      DOUBLE PRECISION ONE,ZERO
+      PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
+*     ..
+*
+*     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+*     transposed and set  NROWA, NCOLA and  NROWB  as the number of rows
+*     and  columns of  A  and the  number of  rows  of  B  respectively.
+*
+      NOTA = LSAME(TRANSA,'N')
+      NOTB = LSAME(TRANSB,'N')
+      IF (NOTA) THEN
+          NROWA = M
+          NCOLA = K
+      ELSE
+          NROWA = K
+          NCOLA = M
+      END IF
+      IF (NOTB) THEN
+          NROWB = K
+      ELSE
+          NROWB = N
+      END IF
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANSA,'C')) .AND.
+     +    (.NOT.LSAME(TRANSA,'T'))) THEN
+          INFO = 1
+      ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANSB,'C')) .AND.
+     +         (.NOT.LSAME(TRANSB,'T'))) THEN
+          INFO = 2
+      ELSE IF (M.LT.0) THEN
+          INFO = 3
+      ELSE IF (N.LT.0) THEN
+          INFO = 4
+      ELSE IF (K.LT.0) THEN
+          INFO = 5
+      ELSE IF (LDA.LT.MAX(1,NROWA)) THEN
+          INFO = 8
+      ELSE IF (LDB.LT.MAX(1,NROWB)) THEN
+          INFO = 10
+      ELSE IF (LDC.LT.MAX(1,M)) THEN
+          INFO = 13
+      END IF
+      IF (INFO.NE.0) THEN
+          CALL XERBLA('DGEMM ',INFO)
+          RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF ((M.EQ.0) .OR. (N.EQ.0) .OR.
+     +    (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN
+*
+*     And if  alpha.eq.zero.
+*
+      IF (ALPHA.EQ.ZERO) THEN
+          IF (BETA.EQ.ZERO) THEN
+              DO 20 J = 1,N
+                  DO 10 I = 1,M
+                      C(I,J) = ZERO
+   10             CONTINUE
+   20         CONTINUE
+          ELSE
+              DO 40 J = 1,N
+                  DO 30 I = 1,M
+                      C(I,J) = BETA*C(I,J)
+   30             CONTINUE
+   40         CONTINUE
+          END IF
+          RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF (NOTB) THEN
+          IF (NOTA) THEN
+*
+*           Form  C := alpha*A*B + beta*C.
+*
+              DO 90 J = 1,N
+                  IF (BETA.EQ.ZERO) THEN
+                      DO 50 I = 1,M
+                          C(I,J) = ZERO
+   50                 CONTINUE
+                  ELSE IF (BETA.NE.ONE) THEN
+                      DO 60 I = 1,M
+                          C(I,J) = BETA*C(I,J)
+   60                 CONTINUE
+                  END IF
+                  DO 80 L = 1,K
+                      TEMP = ALPHA*B(L,J)
+                      DO 70 I = 1,M
+                          C(I,J) = C(I,J) + TEMP*A(I,L)
+   70                 CONTINUE
+   80             CONTINUE
+   90         CONTINUE
+          ELSE
+*
+*           Form  C := alpha*A**T*B + beta*C
+*
+              DO 120 J = 1,N
+                  DO 110 I = 1,M
+                      TEMP = ZERO
+                      DO 100 L = 1,K
+                          TEMP = TEMP + A(L,I)*B(L,J)
+  100                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  110             CONTINUE
+  120         CONTINUE
+          END IF
+      ELSE
+          IF (NOTA) THEN
+*
+*           Form  C := alpha*A*B**T + beta*C
+*
+              DO 170 J = 1,N
+                  IF (BETA.EQ.ZERO) THEN
+                      DO 130 I = 1,M
+                          C(I,J) = ZERO
+  130                 CONTINUE
+                  ELSE IF (BETA.NE.ONE) THEN
+                      DO 140 I = 1,M
+                          C(I,J) = BETA*C(I,J)
+  140                 CONTINUE
+                  END IF
+                  DO 160 L = 1,K
+                      TEMP = ALPHA*B(J,L)
+                      DO 150 I = 1,M
+                          C(I,J) = C(I,J) + TEMP*A(I,L)
+  150                 CONTINUE
+  160             CONTINUE
+  170         CONTINUE
+          ELSE
+*
+*           Form  C := alpha*A**T*B**T + beta*C
+*
+              DO 200 J = 1,N
+                  DO 190 I = 1,M
+                      TEMP = ZERO
+                      DO 180 L = 1,K
+                          TEMP = TEMP + A(L,I)*B(J,L)
+  180                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  190             CONTINUE
+  200         CONTINUE
+          END IF
+      END IF
+*
+      RETURN
+*
+*     End of DGEMM .
+*
+      END
+
+*> \brief \b ZGEMM
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
+*
+*       .. Scalar Arguments ..
+*       COMPLEX*16 ALPHA,BETA
+*       INTEGER K,LDA,LDB,LDC,M,N
+*       CHARACTER TRANSA,TRANSB
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16 A(LDA,*),B(LDB,*),C(LDC,*)
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZGEMM  performs one of the matrix-matrix operations
+*>
+*>    C := alpha*op( A )*op( B ) + beta*C,
+*>
+*> where  op( X ) is one of
+*>
+*>    op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H,
+*>
+*> alpha and beta are scalars, and A, B and C are matrices, with op( A )
+*> an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] TRANSA
+*> \verbatim
+*>          TRANSA is CHARACTER*1
+*>           On entry, TRANSA specifies the form of op( A ) to be used in
+*>           the matrix multiplication as follows:
+*>
+*>              TRANSA = 'N' or 'n',  op( A ) = A.
+*>
+*>              TRANSA = 'T' or 't',  op( A ) = A**T.
+*>
+*>              TRANSA = 'C' or 'c',  op( A ) = A**H.
+*> \endverbatim
+*>
+*> \param[in] TRANSB
+*> \verbatim
+*>          TRANSB is CHARACTER*1
+*>           On entry, TRANSB specifies the form of op( B ) to be used in
+*>           the matrix multiplication as follows:
+*>
+*>              TRANSB = 'N' or 'n',  op( B ) = B.
+*>
+*>              TRANSB = 'T' or 't',  op( B ) = B**T.
+*>
+*>              TRANSB = 'C' or 'c',  op( B ) = B**H.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>           On entry,  M  specifies  the number  of rows  of the  matrix
+*>           op( A )  and of the  matrix  C.  M  must  be at least  zero.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>           On entry,  N  specifies the number  of columns of the matrix
+*>           op( B ) and the number of columns of the matrix C. N must be
+*>           at least zero.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>           On entry,  K  specifies  the number of columns of the matrix
+*>           op( A ) and the number of rows of the matrix op( B ). K must
+*>           be at least  zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*>          ALPHA is COMPLEX*16
+*>           On entry, ALPHA specifies the scalar alpha.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension ( LDA, ka ), where ka is
+*>           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
+*>           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
+*>           part of the array  A  must contain the matrix  A,  otherwise
+*>           the leading  k by m  part of the array  A  must contain  the
+*>           matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>           On entry, LDA specifies the first dimension of A as declared
+*>           in the calling (sub) program. When  TRANSA = 'N' or 'n' then
+*>           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*>           least  max( 1, k ).
+*> \endverbatim
+*>
+*> \param[in] B
+*> \verbatim
+*>          B is COMPLEX*16 array, dimension ( LDB, kb ), where kb is
+*>           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
+*>           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
+*>           part of the array  B  must contain the matrix  B,  otherwise
+*>           the leading  n by k  part of the array  B  must contain  the
+*>           matrix B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>           On entry, LDB specifies the first dimension of B as declared
+*>           in the calling (sub) program. When  TRANSB = 'N' or 'n' then
+*>           LDB must be at least  max( 1, k ), otherwise  LDB must be at
+*>           least  max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] BETA
+*> \verbatim
+*>          BETA is COMPLEX*16
+*>           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*>           supplied as zero then C need not be set on input.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is COMPLEX*16 array, dimension ( LDC, N )
+*>           Before entry, the leading  m by n  part of the array  C must
+*>           contain the matrix  C,  except when  beta  is zero, in which
+*>           case C need not be set on entry.
+*>           On exit, the array  C  is overwritten by the  m by n  matrix
+*>           ( alpha*op( A )*op( B ) + beta*C ).
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*>          LDC is INTEGER
+*>           On entry, LDC specifies the first dimension of C as declared
+*>           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*>           max( 1, m ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date December 2016
+*
+*> \ingroup complex16_blas_level3
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  Level 3 Blas routine.
+*>
+*>  -- Written on 8-February-1989.
+*>     Jack Dongarra, Argonne National Laboratory.
+*>     Iain Duff, AERE Harwell.
+*>     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>     Sven Hammarling, Numerical Algorithms Group Ltd.
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE ZGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
+*
+*  -- Reference BLAS level3 routine (version 3.7.0) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     December 2016
+*
+*     .. Scalar Arguments ..
+      COMPLEX*16 ALPHA,BETA
+      INTEGER K,LDA,LDB,LDC,M,N
+      CHARACTER TRANSA,TRANSB
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16 A(LDA,*),B(LDB,*),C(LDC,*)
+*     ..
+*
+*  =====================================================================
+*
+*     .. External Functions ..
+      LOGICAL LSAME
+      EXTERNAL LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC DCONJG,MAX
+*     ..
+*     .. Local Scalars ..
+      COMPLEX*16 TEMP
+      INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB
+      LOGICAL CONJA,CONJB,NOTA,NOTB
+*     ..
+*     .. Parameters ..
+      COMPLEX*16 ONE
+      PARAMETER (ONE= (1.0D+0,0.0D+0))
+      COMPLEX*16 ZERO
+      PARAMETER (ZERO= (0.0D+0,0.0D+0))
+*     ..
+*
+*     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+*     conjugated or transposed, set  CONJA and CONJB  as true if  A  and
+*     B  respectively are to be  transposed but  not conjugated  and set
+*     NROWA, NCOLA and  NROWB  as the number of rows and  columns  of  A
+*     and the number of rows of  B  respectively.
+*
+      NOTA = LSAME(TRANSA,'N')
+      NOTB = LSAME(TRANSB,'N')
+      CONJA = LSAME(TRANSA,'C')
+      CONJB = LSAME(TRANSB,'C')
+      IF (NOTA) THEN
+          NROWA = M
+          NCOLA = K
+      ELSE
+          NROWA = K
+          NCOLA = M
+      END IF
+      IF (NOTB) THEN
+          NROWB = K
+      ELSE
+          NROWB = N
+      END IF
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND.
+     +    (.NOT.LSAME(TRANSA,'T'))) THEN
+          INFO = 1
+      ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND.
+     +         (.NOT.LSAME(TRANSB,'T'))) THEN
+          INFO = 2
+      ELSE IF (M.LT.0) THEN
+          INFO = 3
+      ELSE IF (N.LT.0) THEN
+          INFO = 4
+      ELSE IF (K.LT.0) THEN
+          INFO = 5
+      ELSE IF (LDA.LT.MAX(1,NROWA)) THEN
+          INFO = 8
+      ELSE IF (LDB.LT.MAX(1,NROWB)) THEN
+          INFO = 10
+      ELSE IF (LDC.LT.MAX(1,M)) THEN
+          INFO = 13
+      END IF
+      IF (INFO.NE.0) THEN
+          CALL XERBLA('ZGEMM ',INFO)
+          RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF ((M.EQ.0) .OR. (N.EQ.0) .OR.
+     +    (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF (ALPHA.EQ.ZERO) THEN
+          IF (BETA.EQ.ZERO) THEN
+              DO 20 J = 1,N
+                  DO 10 I = 1,M
+                      C(I,J) = ZERO
+   10             CONTINUE
+   20         CONTINUE
+          ELSE
+              DO 40 J = 1,N
+                  DO 30 I = 1,M
+                      C(I,J) = BETA*C(I,J)
+   30             CONTINUE
+   40         CONTINUE
+          END IF
+          RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF (NOTB) THEN
+          IF (NOTA) THEN
+*
+*           Form  C := alpha*A*B + beta*C.
+*
+              DO 90 J = 1,N
+                  IF (BETA.EQ.ZERO) THEN
+                      DO 50 I = 1,M
+                          C(I,J) = ZERO
+   50                 CONTINUE
+                  ELSE IF (BETA.NE.ONE) THEN
+                      DO 60 I = 1,M
+                          C(I,J) = BETA*C(I,J)
+   60                 CONTINUE
+                  END IF
+                  DO 80 L = 1,K
+                      TEMP = ALPHA*B(L,J)
+                      DO 70 I = 1,M
+                          C(I,J) = C(I,J) + TEMP*A(I,L)
+   70                 CONTINUE
+   80             CONTINUE
+   90         CONTINUE
+          ELSE IF (CONJA) THEN
+*
+*           Form  C := alpha*A**H*B + beta*C.
+*
+              DO 120 J = 1,N
+                  DO 110 I = 1,M
+                      TEMP = ZERO
+                      DO 100 L = 1,K
+                          TEMP = TEMP + DCONJG(A(L,I))*B(L,J)
+  100                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  110             CONTINUE
+  120         CONTINUE
+          ELSE
+*
+*           Form  C := alpha*A**T*B + beta*C
+*
+              DO 150 J = 1,N
+                  DO 140 I = 1,M
+                      TEMP = ZERO
+                      DO 130 L = 1,K
+                          TEMP = TEMP + A(L,I)*B(L,J)
+  130                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  140             CONTINUE
+  150         CONTINUE
+          END IF
+      ELSE IF (NOTA) THEN
+          IF (CONJB) THEN
+*
+*           Form  C := alpha*A*B**H + beta*C.
+*
+              DO 200 J = 1,N
+                  IF (BETA.EQ.ZERO) THEN
+                      DO 160 I = 1,M
+                          C(I,J) = ZERO
+  160                 CONTINUE
+                  ELSE IF (BETA.NE.ONE) THEN
+                      DO 170 I = 1,M
+                          C(I,J) = BETA*C(I,J)
+  170                 CONTINUE
+                  END IF
+                  DO 190 L = 1,K
+                      TEMP = ALPHA*DCONJG(B(J,L))
+                      DO 180 I = 1,M
+                          C(I,J) = C(I,J) + TEMP*A(I,L)
+  180                 CONTINUE
+  190             CONTINUE
+  200         CONTINUE
+          ELSE
+*
+*           Form  C := alpha*A*B**T + beta*C
+*
+              DO 250 J = 1,N
+                  IF (BETA.EQ.ZERO) THEN
+                      DO 210 I = 1,M
+                          C(I,J) = ZERO
+  210                 CONTINUE
+                  ELSE IF (BETA.NE.ONE) THEN
+                      DO 220 I = 1,M
+                          C(I,J) = BETA*C(I,J)
+  220                 CONTINUE
+                  END IF
+                  DO 240 L = 1,K
+                      TEMP = ALPHA*B(J,L)
+                      DO 230 I = 1,M
+                          C(I,J) = C(I,J) + TEMP*A(I,L)
+  230                 CONTINUE
+  240             CONTINUE
+  250         CONTINUE
+          END IF
+      ELSE IF (CONJA) THEN
+          IF (CONJB) THEN
+*
+*           Form  C := alpha*A**H*B**H + beta*C.
+*
+              DO 280 J = 1,N
+                  DO 270 I = 1,M
+                      TEMP = ZERO
+                      DO 260 L = 1,K
+                          TEMP = TEMP + DCONJG(A(L,I))*DCONJG(B(J,L))
+  260                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  270             CONTINUE
+  280         CONTINUE
+          ELSE
+*
+*           Form  C := alpha*A**H*B**T + beta*C
+*
+              DO 310 J = 1,N
+                  DO 300 I = 1,M
+                      TEMP = ZERO
+                      DO 290 L = 1,K
+                          TEMP = TEMP + DCONJG(A(L,I))*B(J,L)
+  290                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  300             CONTINUE
+  310         CONTINUE
+          END IF
+      ELSE
+          IF (CONJB) THEN
+*
+*           Form  C := alpha*A**T*B**H + beta*C
+*
+              DO 340 J = 1,N
+                  DO 330 I = 1,M
+                      TEMP = ZERO
+                      DO 320 L = 1,K
+                          TEMP = TEMP + A(L,I)*DCONJG(B(J,L))
+  320                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  330             CONTINUE
+  340         CONTINUE
+          ELSE
+*
+*           Form  C := alpha*A**T*B**T + beta*C
+*
+              DO 370 J = 1,N
+                  DO 360 I = 1,M
+                      TEMP = ZERO
+                      DO 350 L = 1,K
+                          TEMP = TEMP + A(L,I)*B(J,L)
+  350                 CONTINUE
+                      IF (BETA.EQ.ZERO) THEN
+                          C(I,J) = ALPHA*TEMP
+                      ELSE
+                          C(I,J) = ALPHA*TEMP + BETA*C(I,J)
+                      END IF
+  360             CONTINUE
+  370         CONTINUE
+          END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZGEMM .
+*
+      END
diff --git a/gcc/testsuite/gfortran.dg/inline_matmul_13.f90 b/gcc/testsuite/gfortran.dg/inline_matmul_13.f90
index 6338aadac3e..8ccfdd5e67d 100644
--- a/gcc/testsuite/gfortran.dg/inline_matmul_13.f90
+++ b/gcc/testsuite/gfortran.dg/inline_matmul_13.f90
@@ -44,4 +44,4 @@ program main
   deallocate(calloc)
 
 end program main
-! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "original" } }
+! { dg-final { scan-tree-dump-times "_gfortran_matmul" 1 "original" } }
diff --git a/gcc/testsuite/gfortran.dg/inline_matmul_16.f90 b/gcc/testsuite/gfortran.dg/inline_matmul_16.f90
index 9d54094cd90..580cb1ac939 100644
--- a/gcc/testsuite/gfortran.dg/inline_matmul_16.f90
+++ b/gcc/testsuite/gfortran.dg/inline_matmul_16.f90
@@ -58,4 +58,4 @@ program main
      end do
   end do
 end program main
-! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }
+! { dg-final { scan-tree-dump-times "_gfortran_matmul" 1 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/matmul_blas_1.f b/gcc/testsuite/gfortran.dg/matmul_blas_1.f
new file mode 100644
index 00000000000..6a88981c9d7
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/matmul_blas_1.f
@@ -0,0 +1,240 @@
+C { dg-do  run }
+C { dg-options "-fcheck=bounds -fdump-tree-optimized -fblas-matmul-limit=1 -O -fexternal-blas" }
+C { dg-additional-sources blas_gemm_routines.f }
+C Test calling of BLAS routines
+
+      program main
+      call sub_s
+      call sub_d
+      call sub_c
+      call sub_z
+      end
+
+      subroutine sub_d
+      implicit none
+      real(8), dimension(3,2) :: a
+      real(8), dimension(2,3) :: at
+      real(8), dimension(2,4) :: b
+      real(8), dimension(4,2) :: bt
+      real(8), dimension(3,4) :: c
+      real(8), dimension(3,4) :: cres
+      real(8), dimension(:,:), allocatable :: c_alloc
+      data a / 2.,  -3.,  5.,  -7., 11., -13./
+      data b /17., -23., 29., -31., 37., -39., 41., -47./
+      data cres /195., -304.,  384.,  275., -428.,  548.,  347., -540.,
+     &           692.,  411., -640.,  816./
+
+      c = matmul(a,b)
+      if (any (c /= cres)) stop 31
+
+      at = transpose(a)
+      c = (1.2,-2.2)
+      c = matmul(transpose(at), b)
+      if (any (c /= cres)) stop 32
+
+      bt = transpose(b)
+      c = (1.2,-2.1)
+      c = matmul(a, transpose(bt))
+      if (any (c /= cres)) stop 33
+
+      c_alloc = matmul(a,b)
+      if (any (c /= cres)) stop 34
+
+      at = transpose(a)
+      deallocate (c_alloc)
+      c = matmul(transpose(at), b)
+      if (any (c /= cres)) stop 35
+
+      bt = transpose(b)
+      allocate (c_alloc(20,20))
+      c = (1.2,-2.1)
+      c = matmul(a, transpose(bt))
+      if (any (c /= cres)) stop 36
+
+      end
+
+      subroutine sub_s
+      implicit none
+      real, dimension(3,2) :: a
+      real, dimension(2,3) :: at
+      real, dimension(2,4) :: b
+      real, dimension(4,2) :: bt
+      real, dimension(3,4) :: c
+      real, dimension(3,4) :: cres
+      real, dimension(:,:), allocatable :: c_alloc
+      data a / 2.,  -3.,  5.,  -7., 11., -13./
+      data b /17., -23., 29., -31., 37., -39., 41., -47./
+      data cres /195., -304.,  384.,  275., -428.,  548.,  347., -540.,
+     &           692.,  411., -640.,  816./
+
+      c = matmul(a,b)
+      if (any (c /= cres)) stop 21
+
+      at = transpose(a)
+      c = (1.2,-2.2)
+      c = matmul(transpose(at), b)
+      if (any (c /= cres)) stop 22
+
+      bt = transpose(b)
+      c = (1.2,-2.1)
+      c = matmul(a, transpose(bt))
+      if (any (c /= cres)) stop 23
+
+      c_alloc = matmul(a,b)
+      if (any (c /= cres)) stop 24
+
+      at = transpose(a)
+      deallocate (c_alloc)
+      c = matmul(transpose(at), b)
+      if (any (c /= cres)) stop 25
+
+      bt = transpose(b)
+      allocate (c_alloc(20,20))
+      c = (1.2,-2.1)
+      c = matmul(a, transpose(bt))
+      if (any (c /= cres)) stop 26
+
+      end
+
+      subroutine sub_c
+      implicit none
+      complex, dimension(3,2) :: a
+      complex, dimension(2,3) :: at, ah
+      complex, dimension(2,4) :: b
+      complex, dimension(4,2) :: bt, bh
+      complex, dimension(3,4) :: c
+      complex, dimension(3,4) :: cres
+      complex, dimension(:,:), allocatable :: c_alloc
+      
+      data a / (2.,-3.), (-5.,7.), (11.,-13.), (17.,19), (-23., -29),
+     &     (-31., 37.)/
+
+      data b / (-41., 43.), (-47., 53.), (-59.,-61.), (-67., 71),
+     &     ( 73.,79. ), (83.,-89.), (97.,-101.), (-107.,-109.)/
+      data cres /(-1759.,217.), (2522.,-358.), (-396.,-2376.),
+     &     (-2789.,-11.),
+     &     (4322.,202.), (-1992.,-4584.), (3485.,3.), (-5408.,-244.),
+     &     (2550.,5750.), (143.,-4379.), (-478.,6794.), (7104.,-2952.) /
+
+      c = matmul(a,b)
+      if (any (c /= cres)) stop 1
+
+      at = transpose(a)
+      c = (1.2,-2.2)
+      c = matmul(transpose(at), b)
+      if (any (c /= cres)) stop 2
+
+      bt = transpose(b)
+      c = (1.2,-2.1)
+      c = matmul(a, transpose(bt))
+      if (any (c /= cres)) stop 3
+
+      ah = transpose(conjg(a))
+      c = (1.2,-2.2)
+      c = matmul(conjg(transpose(ah)), b)
+      if (any (c /= cres)) stop 4
+
+      bh = transpose(conjg(b))
+      c = (1.2,-2.2)
+      c = matmul(a, transpose(conjg(bh)))
+      if (any (c /= cres)) stop 5
+
+      c_alloc = matmul(a,b)
+      if (any (c /= cres)) stop 6
+
+      at = transpose(a)
+      deallocate (c_alloc)
+      c = matmul(transpose(at), b)
+      if (any (c /= cres)) stop 7
+
+      bt = transpose(b)
+      allocate (c_alloc(20,20))
+      c = (1.2,-2.1)
+      c = matmul(a, transpose(bt))
+      if (any (c /= cres)) stop 8
+
+      ah = transpose(conjg(a))
+      c = (1.2,-2.2)
+      c = matmul(conjg(transpose(ah)), b)
+      if (any (c /= cres)) stop 9
+
+      deallocate (c_alloc)
+      allocate (c_alloc(0,0))
+      bh = transpose(conjg(b))
+      c = (1.2,-2.2)
+      c = matmul(a, transpose(conjg(bh)))
+      if (any (c /= cres)) stop 10
+
+      end
+
+      subroutine sub_z
+      implicit none
+      complex(8), dimension(3,2) :: a
+      complex(8), dimension(2,3) :: at, ah
+      complex(8), dimension(2,4) :: b
+      complex(8), dimension(4,2) :: bt, bh
+      complex(8), dimension(3,4) :: c
+      complex(8), dimension(3,4) :: cres
+      complex(8), dimension(:,:), allocatable :: c_alloc
+      
+      data a / (2.,-3.), (-5._8,7.), (11.,-13.), (17.,19),
+     &         (-23., -29), (-31., 37.)/
+
+      data b / (-41., 43.), (-47., 53.), (-59.,-61.), (-67., 71),
+     &     ( 73.,79. ), (83.,-89.), (97.,-101.), (-107.,-109.)/
+      data cres /(-1759.,217.), (2522.,-358.), (-396.,-2376.),
+     &     (-2789.,-11.),
+     &     (4322.,202.), (-1992.,-4584.), (3485.,3.), (-5408.,-244.),
+     &     (2550.,5750.), (143.,-4379.), (-478.,6794.), (7104.,-2952.) /
+
+      c = matmul(a,b)
+      if (any (c /= cres)) stop 11
+
+      at = transpose(a)
+      c = (1.2,-2.2)
+      c = matmul(transpose(at), b)
+      if (any (c /= cres)) stop 12
+
+      bt = transpose(b)
+      c = (1.2,-2.1)
+      c = matmul(a, transpose(bt))
+      if (any (c /= cres)) stop 13
+
+      ah = transpose(conjg(a))
+      c = (1.2,-2.2)
+      c = matmul(conjg(transpose(ah)), b)
+      if (any (c /= cres)) stop 14
+
+      bh = transpose(conjg(b))
+      c = (1.2,-2.2)
+      c = matmul(a, transpose(conjg(bh)))
+      if (any (c /= cres)) stop 15
+
+      c_alloc = matmul(a,b)
+      if (any (c /= cres)) stop 16
+
+      at = transpose(a)
+      deallocate (c_alloc)
+      c = matmul(transpose(at), b)
+      if (any (c /= cres)) stop 17
+
+      bt = transpose(b)
+      allocate (c_alloc(20,20))
+      c = (1.2,-2.1)
+      c = matmul(a, transpose(bt))
+      if (any (c /= cres)) stop 18
+
+      ah = transpose(conjg(a))
+      c = (1.2,-2.2)
+      c = matmul(conjg(transpose(ah)), b)
+      if (any (c /= cres)) stop 19
+
+      deallocate (c_alloc)
+      allocate (c_alloc(0,0))
+      bh = transpose(conjg(b))
+      c = (1.2,-2.2)
+      c = matmul(a, transpose(conjg(bh)))
+      if (any (c /= cres)) stop 20
+
+      end
+! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/matmul_bounds_14.f b/gcc/testsuite/gfortran.dg/matmul_bounds_14.f
new file mode 100644
index 00000000000..4c8a51521d6
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/matmul_bounds_14.f
@@ -0,0 +1,16 @@
+C { dg-do  run }
+C { dg-options "-fno-realloc-lhs -fdump-tree-optimized -fcheck=bounds -fblas-matmul-limit=1 -O -fexternal-blas" }
+C { dg-shouldfail "Fortran runtime error: Array bound mismatch for dimension 2 of array." }
+C { dg-additional-sources blas_gemm_routines.f }
+
+      program main
+      real, dimension(3,2) :: a
+      real, dimension(2,3) :: b
+      real, dimension(:,:), allocatable :: ret
+      a = 1.0
+      b = 2.3
+      allocate(ret(3,2))
+      ret = matmul(a,b)         ! This should throw an error.
+      end
+! { dg-output "Fortran runtime error: Array bound mismatch for dimension 2 of array.*" }
+! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/matmul_bounds_15.f b/gcc/testsuite/gfortran.dg/matmul_bounds_15.f
new file mode 100644
index 00000000000..db4627adb9f
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/matmul_bounds_15.f
@@ -0,0 +1,19 @@
+C { dg-do  run }
+C { dg-options "-fdump-tree-optimized -fcheck=bounds -fblas-matmul-limit=1 -O -fexternal-blas" }
+C { dg-shouldfail "Fortran runtime error: Incorrect extent in argument B in MATMUL intrinsic in dimension 1.*" }
+C { dg-additional-sources blas_gemm_routines.f }
+      program main
+      character(len=20) :: line
+      integer :: n, m
+      real, dimension(3,2) :: a
+      real, dimension(:,:), allocatable :: b
+      real, dimension(:,:), allocatable :: ret
+      a = 1.0
+      line = '3 3'
+      read (unit=line,fmt=*) n, m
+      allocate (b(n,m))
+      b = 2.3
+      ret = matmul(a,b)         ! This should throw an error.
+      end
+! { dg-output "Fortran runtime error: Incorrect extent in argument B in MATMUL intrinsic in dimension 1.*" }
+! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/matmul_bounds_16.f b/gcc/testsuite/gfortran.dg/matmul_bounds_16.f
new file mode 100644
index 00000000000..50e91ae4958
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/matmul_bounds_16.f
@@ -0,0 +1,20 @@
+C { dg-do  run }
+C { dg-options "-fdump-tree-optimized -fcheck=bounds -fblas-matmul-limit=1 -O -fexternal-blas" }
+C { dg-shouldfail "Fortran runtime error: Incorrect extent in argument B in MATMUL intrinsic in dimension 1" }
+C { dg-additional-sources blas_gemm_routines.f }
+
+      program main
+      character(len=20) :: line
+      integer :: n, m
+      real, dimension(3,2) :: a
+      real, dimension(:,:), allocatable :: b
+      real, dimension(:,:), allocatable :: ret
+      a = 1.0
+      line = '4 3'
+      read (unit=line,fmt=*) n, m
+      allocate (b(n,m))
+      b = 2.3
+      ret = matmul(transpose(a),b)         ! This should throw an error.
+      end
+! { dg-output "Fortran runtime error: Incorrect extent in argument B in MATMUL intrinsic in dimension 1.*" }
+! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/promotion_2.f90 b/gcc/testsuite/gfortran.dg/promotion_2.f90
index 7e3c6c92010..83fc0627677 100644
--- a/gcc/testsuite/gfortran.dg/promotion_2.f90
+++ b/gcc/testsuite/gfortran.dg/promotion_2.f90
@@ -1,5 +1,5 @@
 ! { dg-do compile }
-! { dg-options "-fdefault-real-8 -fexternal-blas -fdump-tree-original -finline-matmul-limit=0" }
+! { dg-options "-fdefault-real-8 -fexternal-blas -fblas-matmul-limit=1 -fdump-tree-original -finline-matmul-limit=0" }
 !
 ! PR fortran/54463
 !
@@ -8,8 +8,9 @@
 program test
   implicit none
   real, dimension(3,3) :: A
+  call random_number(a)
   A = matmul(A,A)
 end program test
 
-! { dg-final { scan-tree-dump-times "sgemm_" 0 "original" } }
-! { dg-final { scan-tree-dump-times "dgemm_" 1 "original" } }
+! { dg-final { scan-tree-dump-times "sgemm" 0 "original" } }
+! { dg-final { scan-tree-dump-times "dgemm" 1 "original" } }
-- 
2.30.2