1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
5 This file is part of the GNU Fortran runtime library (libgfortran).
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 #include "libgfortran.h"
31 #if defined (HAVE_GFC_INTEGER_2)
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
37 typedef void (*blas_call
)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_INTEGER_2
*, const GFC_INTEGER_2
*,
39 const int *, const GFC_INTEGER_2
*, const int *,
40 const GFC_INTEGER_2
*, GFC_INTEGER_2
*, const int *,
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
49 The equivalent Fortran pseudo-code is:
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
72 extern void matmul_i2 (gfc_array_i2
* const restrict retarray
,
73 gfc_array_i2
* const restrict a
, gfc_array_i2
* const restrict b
, int try_blas
,
74 int blas_limit
, blas_call gemm
);
75 export_proto(matmul_i2
);
77 /* Put exhaustive list of possible architectures here here, ORed together. */
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
83 matmul_i2_avx (gfc_array_i2
* const restrict retarray
,
84 gfc_array_i2
* const restrict a
, gfc_array_i2
* const restrict b
, int try_blas
,
85 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx")));
87 matmul_i2_avx (gfc_array_i2
* const restrict retarray
,
88 gfc_array_i2
* const restrict a
, gfc_array_i2
* const restrict b
, int try_blas
,
89 int blas_limit
, blas_call gemm
)
91 const GFC_INTEGER_2
* restrict abase
;
92 const GFC_INTEGER_2
* restrict bbase
;
93 GFC_INTEGER_2
* restrict dest
;
95 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
96 index_type x
, y
, n
, count
, xcount
, ycount
;
98 assert (GFC_DESCRIPTOR_RANK (a
) == 2
99 || GFC_DESCRIPTOR_RANK (b
) == 2);
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
103 Either A or B (but not both) can be rank 1:
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
112 if (retarray
->base_addr
== NULL
)
114 if (GFC_DESCRIPTOR_RANK (a
) == 1)
116 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
119 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
121 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
126 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
129 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray
,0));
135 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_INTEGER_2
));
136 retarray
->offset
= 0;
138 else if (unlikely (compile_options
.bounds_check
))
140 index_type ret_extent
, arg_extent
;
142 if (GFC_DESCRIPTOR_RANK (a
) == 1)
144 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
145 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
146 if (arg_extent
!= ret_extent
)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent
, (long int) arg_extent
);
151 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
153 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
154 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
155 if (arg_extent
!= ret_extent
)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent
, (long int) arg_extent
);
162 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
163 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
164 if (arg_extent
!= ret_extent
)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent
, (long int) arg_extent
);
170 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
171 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
172 if (arg_extent
!= ret_extent
)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent
, (long int) arg_extent
);
181 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
186 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
190 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
191 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
195 if (GFC_DESCRIPTOR_RANK (a
) == 1)
197 /* Treat it as a a row matrix A[1,count]. */
198 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
202 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
206 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
207 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
209 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
210 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
213 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
215 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
219 if (GFC_DESCRIPTOR_RANK (b
) == 1)
221 /* Treat it as a column matrix B[count,1] */
222 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
224 /* bystride should never be used for 1-dimensional b.
225 in case it is we want it to cause a segfault, rather than
226 an incorrect result. */
227 bystride
= 0xDEADBEEF;
232 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
233 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
234 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
237 abase
= a
->base_addr
;
238 bbase
= b
->base_addr
;
239 dest
= retarray
->base_addr
;
241 /* Now that everything is set up, we perform the multiplication
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
248 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
249 && (bxstride
== 1 || bystride
== 1)
250 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
253 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
254 const GFC_INTEGER_2 one
= 1, zero
= 0;
255 const int lda
= (axstride
== 1) ? aystride
: axstride
,
256 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
258 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
260 assert (gemm
!= NULL
);
261 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
262 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
268 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
276 S-901 87 Umea, Sweden
278 from netlib.org, translated to C, and modified for matmul.m4. */
280 const GFC_INTEGER_2
*a
, *b
;
282 const index_type m
= xcount
, n
= ycount
, k
= count
;
284 /* System generated locals */
285 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
286 i1
, i2
, i3
, i4
, i5
, i6
;
288 /* Local variables */
289 GFC_INTEGER_2 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
290 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
291 index_type i
, j
, l
, ii
, jj
, ll
;
292 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
297 c
= retarray
->base_addr
;
299 /* Parameter adjustments */
301 c_offset
= 1 + c_dim1
;
304 a_offset
= 1 + a_dim1
;
307 b_offset
= 1 + b_dim1
;
310 /* Early exit if possible */
311 if (m
== 0 || n
== 0 || k
== 0)
314 /* Adjust size of t1 to what is needed. */
316 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
320 t1
= malloc (t1_dim
* sizeof(GFC_INTEGER_2
));
325 c
[i
+ j
* c_dim1
] = (GFC_INTEGER_2
)0;
327 /* Start turning the crank. */
329 for (jj
= 1; jj
<= i1
; jj
+= 512)
335 ujsec
= jsec
- jsec
% 4;
337 for (ll
= 1; ll
<= i2
; ll
+= 256)
343 ulsec
= lsec
- lsec
% 2;
346 for (ii
= 1; ii
<= i3
; ii
+= 256)
352 uisec
= isec
- isec
% 2;
354 for (l
= ll
; l
<= i4
; l
+= 2)
357 for (i
= ii
; i
<= i5
; i
+= 2)
359 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
361 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
362 a
[i
+ (l
+ 1) * a_dim1
];
363 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
364 a
[i
+ 1 + l
* a_dim1
];
365 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
366 a
[i
+ 1 + (l
+ 1) * a_dim1
];
370 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
371 a
[ii
+ isec
- 1 + l
* a_dim1
];
372 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
373 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
379 for (i
= ii
; i
<= i4
; ++i
)
381 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
382 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
386 uisec
= isec
- isec
% 4;
388 for (j
= jj
; j
<= i4
; j
+= 4)
391 for (i
= ii
; i
<= i5
; i
+= 4)
393 f11
= c
[i
+ j
* c_dim1
];
394 f21
= c
[i
+ 1 + j
* c_dim1
];
395 f12
= c
[i
+ (j
+ 1) * c_dim1
];
396 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
397 f13
= c
[i
+ (j
+ 2) * c_dim1
];
398 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
399 f14
= c
[i
+ (j
+ 3) * c_dim1
];
400 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
401 f31
= c
[i
+ 2 + j
* c_dim1
];
402 f41
= c
[i
+ 3 + j
* c_dim1
];
403 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
404 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
405 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
406 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
407 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
408 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
410 for (l
= ll
; l
<= i6
; ++l
)
412 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
414 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
416 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
417 * b
[l
+ (j
+ 1) * b_dim1
];
418 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
419 * b
[l
+ (j
+ 1) * b_dim1
];
420 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
421 * b
[l
+ (j
+ 2) * b_dim1
];
422 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
423 * b
[l
+ (j
+ 2) * b_dim1
];
424 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
425 * b
[l
+ (j
+ 3) * b_dim1
];
426 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
427 * b
[l
+ (j
+ 3) * b_dim1
];
428 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
430 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
432 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
433 * b
[l
+ (j
+ 1) * b_dim1
];
434 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
435 * b
[l
+ (j
+ 1) * b_dim1
];
436 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
437 * b
[l
+ (j
+ 2) * b_dim1
];
438 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
439 * b
[l
+ (j
+ 2) * b_dim1
];
440 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
441 * b
[l
+ (j
+ 3) * b_dim1
];
442 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
443 * b
[l
+ (j
+ 3) * b_dim1
];
445 c
[i
+ j
* c_dim1
] = f11
;
446 c
[i
+ 1 + j
* c_dim1
] = f21
;
447 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
448 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
449 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
450 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
451 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
452 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
453 c
[i
+ 2 + j
* c_dim1
] = f31
;
454 c
[i
+ 3 + j
* c_dim1
] = f41
;
455 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
456 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
457 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
458 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
459 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
460 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
465 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
467 f11
= c
[i
+ j
* c_dim1
];
468 f12
= c
[i
+ (j
+ 1) * c_dim1
];
469 f13
= c
[i
+ (j
+ 2) * c_dim1
];
470 f14
= c
[i
+ (j
+ 3) * c_dim1
];
472 for (l
= ll
; l
<= i6
; ++l
)
474 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
475 257] * b
[l
+ j
* b_dim1
];
476 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
477 257] * b
[l
+ (j
+ 1) * b_dim1
];
478 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
479 257] * b
[l
+ (j
+ 2) * b_dim1
];
480 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
481 257] * b
[l
+ (j
+ 3) * b_dim1
];
483 c
[i
+ j
* c_dim1
] = f11
;
484 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
485 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
486 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
493 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
496 for (i
= ii
; i
<= i5
; i
+= 4)
498 f11
= c
[i
+ j
* c_dim1
];
499 f21
= c
[i
+ 1 + j
* c_dim1
];
500 f31
= c
[i
+ 2 + j
* c_dim1
];
501 f41
= c
[i
+ 3 + j
* c_dim1
];
503 for (l
= ll
; l
<= i6
; ++l
)
505 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
506 257] * b
[l
+ j
* b_dim1
];
507 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
508 257] * b
[l
+ j
* b_dim1
];
509 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
510 257] * b
[l
+ j
* b_dim1
];
511 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
512 257] * b
[l
+ j
* b_dim1
];
514 c
[i
+ j
* c_dim1
] = f11
;
515 c
[i
+ 1 + j
* c_dim1
] = f21
;
516 c
[i
+ 2 + j
* c_dim1
] = f31
;
517 c
[i
+ 3 + j
* c_dim1
] = f41
;
520 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
522 f11
= c
[i
+ j
* c_dim1
];
524 for (l
= ll
; l
<= i6
; ++l
)
526 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
527 257] * b
[l
+ j
* b_dim1
];
529 c
[i
+ j
* c_dim1
] = f11
;
539 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
541 if (GFC_DESCRIPTOR_RANK (a
) != 1)
543 const GFC_INTEGER_2
*restrict abase_x
;
544 const GFC_INTEGER_2
*restrict bbase_y
;
545 GFC_INTEGER_2
*restrict dest_y
;
548 for (y
= 0; y
< ycount
; y
++)
550 bbase_y
= &bbase
[y
*bystride
];
551 dest_y
= &dest
[y
*rystride
];
552 for (x
= 0; x
< xcount
; x
++)
554 abase_x
= &abase
[x
*axstride
];
555 s
= (GFC_INTEGER_2
) 0;
556 for (n
= 0; n
< count
; n
++)
557 s
+= abase_x
[n
] * bbase_y
[n
];
564 const GFC_INTEGER_2
*restrict bbase_y
;
567 for (y
= 0; y
< ycount
; y
++)
569 bbase_y
= &bbase
[y
*bystride
];
570 s
= (GFC_INTEGER_2
) 0;
571 for (n
= 0; n
< count
; n
++)
572 s
+= abase
[n
*axstride
] * bbase_y
[n
];
573 dest
[y
*rystride
] = s
;
577 else if (axstride
< aystride
)
579 for (y
= 0; y
< ycount
; y
++)
580 for (x
= 0; x
< xcount
; x
++)
581 dest
[x
*rxstride
+ y
*rystride
] = (GFC_INTEGER_2
)0;
583 for (y
= 0; y
< ycount
; y
++)
584 for (n
= 0; n
< count
; n
++)
585 for (x
= 0; x
< xcount
; x
++)
586 /* dest[x,y] += a[x,n] * b[n,y] */
587 dest
[x
*rxstride
+ y
*rystride
] +=
588 abase
[x
*axstride
+ n
*aystride
] *
589 bbase
[n
*bxstride
+ y
*bystride
];
591 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
593 const GFC_INTEGER_2
*restrict bbase_y
;
596 for (y
= 0; y
< ycount
; y
++)
598 bbase_y
= &bbase
[y
*bystride
];
599 s
= (GFC_INTEGER_2
) 0;
600 for (n
= 0; n
< count
; n
++)
601 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
602 dest
[y
*rxstride
] = s
;
607 const GFC_INTEGER_2
*restrict abase_x
;
608 const GFC_INTEGER_2
*restrict bbase_y
;
609 GFC_INTEGER_2
*restrict dest_y
;
612 for (y
= 0; y
< ycount
; y
++)
614 bbase_y
= &bbase
[y
*bystride
];
615 dest_y
= &dest
[y
*rystride
];
616 for (x
= 0; x
< xcount
; x
++)
618 abase_x
= &abase
[x
*axstride
];
619 s
= (GFC_INTEGER_2
) 0;
620 for (n
= 0; n
< count
; n
++)
621 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
622 dest_y
[x
*rxstride
] = s
;
631 #endif /* HAVE_AVX */
635 matmul_i2_avx2 (gfc_array_i2
* const restrict retarray
,
636 gfc_array_i2
* const restrict a
, gfc_array_i2
* const restrict b
, int try_blas
,
637 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx2,fma")));
639 matmul_i2_avx2 (gfc_array_i2
* const restrict retarray
,
640 gfc_array_i2
* const restrict a
, gfc_array_i2
* const restrict b
, int try_blas
,
641 int blas_limit
, blas_call gemm
)
643 const GFC_INTEGER_2
* restrict abase
;
644 const GFC_INTEGER_2
* restrict bbase
;
645 GFC_INTEGER_2
* restrict dest
;
647 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
648 index_type x
, y
, n
, count
, xcount
, ycount
;
650 assert (GFC_DESCRIPTOR_RANK (a
) == 2
651 || GFC_DESCRIPTOR_RANK (b
) == 2);
653 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
655 Either A or B (but not both) can be rank 1:
657 o One-dimensional argument A is implicitly treated as a row matrix
658 dimensioned [1,count], so xcount=1.
660 o One-dimensional argument B is implicitly treated as a column matrix
661 dimensioned [count, 1], so ycount=1.
664 if (retarray
->base_addr
== NULL
)
666 if (GFC_DESCRIPTOR_RANK (a
) == 1)
668 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
669 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
671 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
673 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
674 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
678 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
679 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
681 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
682 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
683 GFC_DESCRIPTOR_EXTENT(retarray
,0));
687 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_INTEGER_2
));
688 retarray
->offset
= 0;
690 else if (unlikely (compile_options
.bounds_check
))
692 index_type ret_extent
, arg_extent
;
694 if (GFC_DESCRIPTOR_RANK (a
) == 1)
696 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
697 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
698 if (arg_extent
!= ret_extent
)
699 runtime_error ("Incorrect extent in return array in"
700 " MATMUL intrinsic: is %ld, should be %ld",
701 (long int) ret_extent
, (long int) arg_extent
);
703 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
705 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
706 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
707 if (arg_extent
!= ret_extent
)
708 runtime_error ("Incorrect extent in return array in"
709 " MATMUL intrinsic: is %ld, should be %ld",
710 (long int) ret_extent
, (long int) arg_extent
);
714 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
715 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
716 if (arg_extent
!= ret_extent
)
717 runtime_error ("Incorrect extent in return array in"
718 " MATMUL intrinsic for dimension 1:"
719 " is %ld, should be %ld",
720 (long int) ret_extent
, (long int) arg_extent
);
722 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
723 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
724 if (arg_extent
!= ret_extent
)
725 runtime_error ("Incorrect extent in return array in"
726 " MATMUL intrinsic for dimension 2:"
727 " is %ld, should be %ld",
728 (long int) ret_extent
, (long int) arg_extent
);
733 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
735 /* One-dimensional result may be addressed in the code below
736 either as a row or a column matrix. We want both cases to
738 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
742 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
743 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
747 if (GFC_DESCRIPTOR_RANK (a
) == 1)
749 /* Treat it as a a row matrix A[1,count]. */
750 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
754 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
758 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
759 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
761 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
762 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
765 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
767 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
768 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
771 if (GFC_DESCRIPTOR_RANK (b
) == 1)
773 /* Treat it as a column matrix B[count,1] */
774 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
776 /* bystride should never be used for 1-dimensional b.
777 in case it is we want it to cause a segfault, rather than
778 an incorrect result. */
779 bystride
= 0xDEADBEEF;
784 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
785 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
786 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
789 abase
= a
->base_addr
;
790 bbase
= b
->base_addr
;
791 dest
= retarray
->base_addr
;
793 /* Now that everything is set up, we perform the multiplication
796 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
797 #define min(a,b) ((a) <= (b) ? (a) : (b))
798 #define max(a,b) ((a) >= (b) ? (a) : (b))
800 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
801 && (bxstride
== 1 || bystride
== 1)
802 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
805 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
806 const GFC_INTEGER_2 one
= 1, zero
= 0;
807 const int lda
= (axstride
== 1) ? aystride
: axstride
,
808 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
810 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
812 assert (gemm
!= NULL
);
813 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
814 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
820 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
822 /* This block of code implements a tuned matmul, derived from
823 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
825 Bo Kagstrom and Per Ling
826 Department of Computing Science
828 S-901 87 Umea, Sweden
830 from netlib.org, translated to C, and modified for matmul.m4. */
832 const GFC_INTEGER_2
*a
, *b
;
834 const index_type m
= xcount
, n
= ycount
, k
= count
;
836 /* System generated locals */
837 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
838 i1
, i2
, i3
, i4
, i5
, i6
;
840 /* Local variables */
841 GFC_INTEGER_2 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
842 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
843 index_type i
, j
, l
, ii
, jj
, ll
;
844 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
849 c
= retarray
->base_addr
;
851 /* Parameter adjustments */
853 c_offset
= 1 + c_dim1
;
856 a_offset
= 1 + a_dim1
;
859 b_offset
= 1 + b_dim1
;
862 /* Early exit if possible */
863 if (m
== 0 || n
== 0 || k
== 0)
866 /* Adjust size of t1 to what is needed. */
868 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
872 t1
= malloc (t1_dim
* sizeof(GFC_INTEGER_2
));
877 c
[i
+ j
* c_dim1
] = (GFC_INTEGER_2
)0;
879 /* Start turning the crank. */
881 for (jj
= 1; jj
<= i1
; jj
+= 512)
887 ujsec
= jsec
- jsec
% 4;
889 for (ll
= 1; ll
<= i2
; ll
+= 256)
895 ulsec
= lsec
- lsec
% 2;
898 for (ii
= 1; ii
<= i3
; ii
+= 256)
904 uisec
= isec
- isec
% 2;
906 for (l
= ll
; l
<= i4
; l
+= 2)
909 for (i
= ii
; i
<= i5
; i
+= 2)
911 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
913 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
914 a
[i
+ (l
+ 1) * a_dim1
];
915 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
916 a
[i
+ 1 + l
* a_dim1
];
917 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
918 a
[i
+ 1 + (l
+ 1) * a_dim1
];
922 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
923 a
[ii
+ isec
- 1 + l
* a_dim1
];
924 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
925 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
931 for (i
= ii
; i
<= i4
; ++i
)
933 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
934 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
938 uisec
= isec
- isec
% 4;
940 for (j
= jj
; j
<= i4
; j
+= 4)
943 for (i
= ii
; i
<= i5
; i
+= 4)
945 f11
= c
[i
+ j
* c_dim1
];
946 f21
= c
[i
+ 1 + j
* c_dim1
];
947 f12
= c
[i
+ (j
+ 1) * c_dim1
];
948 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
949 f13
= c
[i
+ (j
+ 2) * c_dim1
];
950 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
951 f14
= c
[i
+ (j
+ 3) * c_dim1
];
952 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
953 f31
= c
[i
+ 2 + j
* c_dim1
];
954 f41
= c
[i
+ 3 + j
* c_dim1
];
955 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
956 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
957 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
958 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
959 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
960 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
962 for (l
= ll
; l
<= i6
; ++l
)
964 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
966 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
968 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
969 * b
[l
+ (j
+ 1) * b_dim1
];
970 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
971 * b
[l
+ (j
+ 1) * b_dim1
];
972 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
973 * b
[l
+ (j
+ 2) * b_dim1
];
974 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
975 * b
[l
+ (j
+ 2) * b_dim1
];
976 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
977 * b
[l
+ (j
+ 3) * b_dim1
];
978 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
979 * b
[l
+ (j
+ 3) * b_dim1
];
980 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
982 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
984 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
985 * b
[l
+ (j
+ 1) * b_dim1
];
986 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
987 * b
[l
+ (j
+ 1) * b_dim1
];
988 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
989 * b
[l
+ (j
+ 2) * b_dim1
];
990 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
991 * b
[l
+ (j
+ 2) * b_dim1
];
992 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
993 * b
[l
+ (j
+ 3) * b_dim1
];
994 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
995 * b
[l
+ (j
+ 3) * b_dim1
];
997 c
[i
+ j
* c_dim1
] = f11
;
998 c
[i
+ 1 + j
* c_dim1
] = f21
;
999 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1000 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
1001 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1002 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
1003 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1004 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
1005 c
[i
+ 2 + j
* c_dim1
] = f31
;
1006 c
[i
+ 3 + j
* c_dim1
] = f41
;
1007 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
1008 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1009 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1010 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1011 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1012 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1017 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1019 f11
= c
[i
+ j
* c_dim1
];
1020 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1021 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1022 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1024 for (l
= ll
; l
<= i6
; ++l
)
1026 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1027 257] * b
[l
+ j
* b_dim1
];
1028 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1029 257] * b
[l
+ (j
+ 1) * b_dim1
];
1030 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1031 257] * b
[l
+ (j
+ 2) * b_dim1
];
1032 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1033 257] * b
[l
+ (j
+ 3) * b_dim1
];
1035 c
[i
+ j
* c_dim1
] = f11
;
1036 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1037 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1038 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1045 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1047 i5
= ii
+ uisec
- 1;
1048 for (i
= ii
; i
<= i5
; i
+= 4)
1050 f11
= c
[i
+ j
* c_dim1
];
1051 f21
= c
[i
+ 1 + j
* c_dim1
];
1052 f31
= c
[i
+ 2 + j
* c_dim1
];
1053 f41
= c
[i
+ 3 + j
* c_dim1
];
1055 for (l
= ll
; l
<= i6
; ++l
)
1057 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1058 257] * b
[l
+ j
* b_dim1
];
1059 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1060 257] * b
[l
+ j
* b_dim1
];
1061 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1062 257] * b
[l
+ j
* b_dim1
];
1063 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1064 257] * b
[l
+ j
* b_dim1
];
1066 c
[i
+ j
* c_dim1
] = f11
;
1067 c
[i
+ 1 + j
* c_dim1
] = f21
;
1068 c
[i
+ 2 + j
* c_dim1
] = f31
;
1069 c
[i
+ 3 + j
* c_dim1
] = f41
;
1072 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1074 f11
= c
[i
+ j
* c_dim1
];
1076 for (l
= ll
; l
<= i6
; ++l
)
1078 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1079 257] * b
[l
+ j
* b_dim1
];
1081 c
[i
+ j
* c_dim1
] = f11
;
1091 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1093 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1095 const GFC_INTEGER_2
*restrict abase_x
;
1096 const GFC_INTEGER_2
*restrict bbase_y
;
1097 GFC_INTEGER_2
*restrict dest_y
;
1100 for (y
= 0; y
< ycount
; y
++)
1102 bbase_y
= &bbase
[y
*bystride
];
1103 dest_y
= &dest
[y
*rystride
];
1104 for (x
= 0; x
< xcount
; x
++)
1106 abase_x
= &abase
[x
*axstride
];
1107 s
= (GFC_INTEGER_2
) 0;
1108 for (n
= 0; n
< count
; n
++)
1109 s
+= abase_x
[n
] * bbase_y
[n
];
1116 const GFC_INTEGER_2
*restrict bbase_y
;
1119 for (y
= 0; y
< ycount
; y
++)
1121 bbase_y
= &bbase
[y
*bystride
];
1122 s
= (GFC_INTEGER_2
) 0;
1123 for (n
= 0; n
< count
; n
++)
1124 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1125 dest
[y
*rystride
] = s
;
1129 else if (axstride
< aystride
)
1131 for (y
= 0; y
< ycount
; y
++)
1132 for (x
= 0; x
< xcount
; x
++)
1133 dest
[x
*rxstride
+ y
*rystride
] = (GFC_INTEGER_2
)0;
1135 for (y
= 0; y
< ycount
; y
++)
1136 for (n
= 0; n
< count
; n
++)
1137 for (x
= 0; x
< xcount
; x
++)
1138 /* dest[x,y] += a[x,n] * b[n,y] */
1139 dest
[x
*rxstride
+ y
*rystride
] +=
1140 abase
[x
*axstride
+ n
*aystride
] *
1141 bbase
[n
*bxstride
+ y
*bystride
];
1143 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1145 const GFC_INTEGER_2
*restrict bbase_y
;
1148 for (y
= 0; y
< ycount
; y
++)
1150 bbase_y
= &bbase
[y
*bystride
];
1151 s
= (GFC_INTEGER_2
) 0;
1152 for (n
= 0; n
< count
; n
++)
1153 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1154 dest
[y
*rxstride
] = s
;
1159 const GFC_INTEGER_2
*restrict abase_x
;
1160 const GFC_INTEGER_2
*restrict bbase_y
;
1161 GFC_INTEGER_2
*restrict dest_y
;
1164 for (y
= 0; y
< ycount
; y
++)
1166 bbase_y
= &bbase
[y
*bystride
];
1167 dest_y
= &dest
[y
*rystride
];
1168 for (x
= 0; x
< xcount
; x
++)
1170 abase_x
= &abase
[x
*axstride
];
1171 s
= (GFC_INTEGER_2
) 0;
1172 for (n
= 0; n
< count
; n
++)
1173 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1174 dest_y
[x
*rxstride
] = s
;
1183 #endif /* HAVE_AVX2 */
1187 matmul_i2_avx512f (gfc_array_i2
* const restrict retarray
,
1188 gfc_array_i2
* const restrict a
, gfc_array_i2
* const restrict b
, int try_blas
,
1189 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx512f")));
1191 matmul_i2_avx512f (gfc_array_i2
* const restrict retarray
,
1192 gfc_array_i2
* const restrict a
, gfc_array_i2
* const restrict b
, int try_blas
,
1193 int blas_limit
, blas_call gemm
)
1195 const GFC_INTEGER_2
* restrict abase
;
1196 const GFC_INTEGER_2
* restrict bbase
;
1197 GFC_INTEGER_2
* restrict dest
;
1199 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1200 index_type x
, y
, n
, count
, xcount
, ycount
;
1202 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1203 || GFC_DESCRIPTOR_RANK (b
) == 2);
1205 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1207 Either A or B (but not both) can be rank 1:
1209 o One-dimensional argument A is implicitly treated as a row matrix
1210 dimensioned [1,count], so xcount=1.
1212 o One-dimensional argument B is implicitly treated as a column matrix
1213 dimensioned [count, 1], so ycount=1.
1216 if (retarray
->base_addr
== NULL
)
1218 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1220 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1221 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1223 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1225 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1226 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1230 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1231 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1233 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1234 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1235 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1239 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_INTEGER_2
));
1240 retarray
->offset
= 0;
1242 else if (unlikely (compile_options
.bounds_check
))
1244 index_type ret_extent
, arg_extent
;
1246 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1248 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1249 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1250 if (arg_extent
!= ret_extent
)
1251 runtime_error ("Incorrect extent in return array in"
1252 " MATMUL intrinsic: is %ld, should be %ld",
1253 (long int) ret_extent
, (long int) arg_extent
);
1255 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1257 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1258 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1259 if (arg_extent
!= ret_extent
)
1260 runtime_error ("Incorrect extent in return array in"
1261 " MATMUL intrinsic: is %ld, should be %ld",
1262 (long int) ret_extent
, (long int) arg_extent
);
1266 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1267 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1268 if (arg_extent
!= ret_extent
)
1269 runtime_error ("Incorrect extent in return array in"
1270 " MATMUL intrinsic for dimension 1:"
1271 " is %ld, should be %ld",
1272 (long int) ret_extent
, (long int) arg_extent
);
1274 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1275 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1276 if (arg_extent
!= ret_extent
)
1277 runtime_error ("Incorrect extent in return array in"
1278 " MATMUL intrinsic for dimension 2:"
1279 " is %ld, should be %ld",
1280 (long int) ret_extent
, (long int) arg_extent
);
1285 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
1287 /* One-dimensional result may be addressed in the code below
1288 either as a row or a column matrix. We want both cases to
1290 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1294 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1295 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1299 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1301 /* Treat it as a a row matrix A[1,count]. */
1302 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1306 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1310 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1311 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1313 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1314 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1317 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1319 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1320 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1323 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1325 /* Treat it as a column matrix B[count,1] */
1326 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1328 /* bystride should never be used for 1-dimensional b.
1329 in case it is we want it to cause a segfault, rather than
1330 an incorrect result. */
1331 bystride
= 0xDEADBEEF;
1336 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1337 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1338 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1341 abase
= a
->base_addr
;
1342 bbase
= b
->base_addr
;
1343 dest
= retarray
->base_addr
;
1345 /* Now that everything is set up, we perform the multiplication
1348 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1349 #define min(a,b) ((a) <= (b) ? (a) : (b))
1350 #define max(a,b) ((a) >= (b) ? (a) : (b))
1352 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1353 && (bxstride
== 1 || bystride
== 1)
1354 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1355 > POW3(blas_limit
)))
1357 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1358 const GFC_INTEGER_2 one
= 1, zero
= 0;
1359 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1360 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1362 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1364 assert (gemm
!= NULL
);
1365 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1366 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1372 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1374 /* This block of code implements a tuned matmul, derived from
1375 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1377 Bo Kagstrom and Per Ling
1378 Department of Computing Science
1380 S-901 87 Umea, Sweden
1382 from netlib.org, translated to C, and modified for matmul.m4. */
1384 const GFC_INTEGER_2
*a
, *b
;
1386 const index_type m
= xcount
, n
= ycount
, k
= count
;
1388 /* System generated locals */
1389 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1390 i1
, i2
, i3
, i4
, i5
, i6
;
1392 /* Local variables */
1393 GFC_INTEGER_2 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1394 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1395 index_type i
, j
, l
, ii
, jj
, ll
;
1396 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1401 c
= retarray
->base_addr
;
1403 /* Parameter adjustments */
1405 c_offset
= 1 + c_dim1
;
1408 a_offset
= 1 + a_dim1
;
1411 b_offset
= 1 + b_dim1
;
1414 /* Early exit if possible */
1415 if (m
== 0 || n
== 0 || k
== 0)
1418 /* Adjust size of t1 to what is needed. */
1420 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
1424 t1
= malloc (t1_dim
* sizeof(GFC_INTEGER_2
));
1426 /* Empty c first. */
1427 for (j
=1; j
<=n
; j
++)
1428 for (i
=1; i
<=m
; i
++)
1429 c
[i
+ j
* c_dim1
] = (GFC_INTEGER_2
)0;
1431 /* Start turning the crank. */
1433 for (jj
= 1; jj
<= i1
; jj
+= 512)
1439 ujsec
= jsec
- jsec
% 4;
1441 for (ll
= 1; ll
<= i2
; ll
+= 256)
1447 ulsec
= lsec
- lsec
% 2;
1450 for (ii
= 1; ii
<= i3
; ii
+= 256)
1456 uisec
= isec
- isec
% 2;
1457 i4
= ll
+ ulsec
- 1;
1458 for (l
= ll
; l
<= i4
; l
+= 2)
1460 i5
= ii
+ uisec
- 1;
1461 for (i
= ii
; i
<= i5
; i
+= 2)
1463 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
1465 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
1466 a
[i
+ (l
+ 1) * a_dim1
];
1467 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
1468 a
[i
+ 1 + l
* a_dim1
];
1469 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
1470 a
[i
+ 1 + (l
+ 1) * a_dim1
];
1474 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
1475 a
[ii
+ isec
- 1 + l
* a_dim1
];
1476 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
1477 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
1483 for (i
= ii
; i
<= i4
; ++i
)
1485 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
1486 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
1490 uisec
= isec
- isec
% 4;
1491 i4
= jj
+ ujsec
- 1;
1492 for (j
= jj
; j
<= i4
; j
+= 4)
1494 i5
= ii
+ uisec
- 1;
1495 for (i
= ii
; i
<= i5
; i
+= 4)
1497 f11
= c
[i
+ j
* c_dim1
];
1498 f21
= c
[i
+ 1 + j
* c_dim1
];
1499 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1500 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
1501 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1502 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
1503 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1504 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
1505 f31
= c
[i
+ 2 + j
* c_dim1
];
1506 f41
= c
[i
+ 3 + j
* c_dim1
];
1507 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
1508 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
1509 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
1510 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
1511 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
1512 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
1514 for (l
= ll
; l
<= i6
; ++l
)
1516 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1517 * b
[l
+ j
* b_dim1
];
1518 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1519 * b
[l
+ j
* b_dim1
];
1520 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1521 * b
[l
+ (j
+ 1) * b_dim1
];
1522 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1523 * b
[l
+ (j
+ 1) * b_dim1
];
1524 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1525 * b
[l
+ (j
+ 2) * b_dim1
];
1526 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1527 * b
[l
+ (j
+ 2) * b_dim1
];
1528 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1529 * b
[l
+ (j
+ 3) * b_dim1
];
1530 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1531 * b
[l
+ (j
+ 3) * b_dim1
];
1532 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1533 * b
[l
+ j
* b_dim1
];
1534 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1535 * b
[l
+ j
* b_dim1
];
1536 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1537 * b
[l
+ (j
+ 1) * b_dim1
];
1538 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1539 * b
[l
+ (j
+ 1) * b_dim1
];
1540 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1541 * b
[l
+ (j
+ 2) * b_dim1
];
1542 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1543 * b
[l
+ (j
+ 2) * b_dim1
];
1544 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1545 * b
[l
+ (j
+ 3) * b_dim1
];
1546 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1547 * b
[l
+ (j
+ 3) * b_dim1
];
1549 c
[i
+ j
* c_dim1
] = f11
;
1550 c
[i
+ 1 + j
* c_dim1
] = f21
;
1551 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1552 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
1553 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1554 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
1555 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1556 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
1557 c
[i
+ 2 + j
* c_dim1
] = f31
;
1558 c
[i
+ 3 + j
* c_dim1
] = f41
;
1559 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
1560 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1561 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1562 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1563 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1564 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1569 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1571 f11
= c
[i
+ j
* c_dim1
];
1572 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1573 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1574 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1576 for (l
= ll
; l
<= i6
; ++l
)
1578 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1579 257] * b
[l
+ j
* b_dim1
];
1580 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1581 257] * b
[l
+ (j
+ 1) * b_dim1
];
1582 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1583 257] * b
[l
+ (j
+ 2) * b_dim1
];
1584 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1585 257] * b
[l
+ (j
+ 3) * b_dim1
];
1587 c
[i
+ j
* c_dim1
] = f11
;
1588 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1589 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1590 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1597 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1599 i5
= ii
+ uisec
- 1;
1600 for (i
= ii
; i
<= i5
; i
+= 4)
1602 f11
= c
[i
+ j
* c_dim1
];
1603 f21
= c
[i
+ 1 + j
* c_dim1
];
1604 f31
= c
[i
+ 2 + j
* c_dim1
];
1605 f41
= c
[i
+ 3 + j
* c_dim1
];
1607 for (l
= ll
; l
<= i6
; ++l
)
1609 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1610 257] * b
[l
+ j
* b_dim1
];
1611 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1612 257] * b
[l
+ j
* b_dim1
];
1613 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1614 257] * b
[l
+ j
* b_dim1
];
1615 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1616 257] * b
[l
+ j
* b_dim1
];
1618 c
[i
+ j
* c_dim1
] = f11
;
1619 c
[i
+ 1 + j
* c_dim1
] = f21
;
1620 c
[i
+ 2 + j
* c_dim1
] = f31
;
1621 c
[i
+ 3 + j
* c_dim1
] = f41
;
1624 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1626 f11
= c
[i
+ j
* c_dim1
];
1628 for (l
= ll
; l
<= i6
; ++l
)
1630 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1631 257] * b
[l
+ j
* b_dim1
];
1633 c
[i
+ j
* c_dim1
] = f11
;
1643 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1645 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1647 const GFC_INTEGER_2
*restrict abase_x
;
1648 const GFC_INTEGER_2
*restrict bbase_y
;
1649 GFC_INTEGER_2
*restrict dest_y
;
1652 for (y
= 0; y
< ycount
; y
++)
1654 bbase_y
= &bbase
[y
*bystride
];
1655 dest_y
= &dest
[y
*rystride
];
1656 for (x
= 0; x
< xcount
; x
++)
1658 abase_x
= &abase
[x
*axstride
];
1659 s
= (GFC_INTEGER_2
) 0;
1660 for (n
= 0; n
< count
; n
++)
1661 s
+= abase_x
[n
] * bbase_y
[n
];
1668 const GFC_INTEGER_2
*restrict bbase_y
;
1671 for (y
= 0; y
< ycount
; y
++)
1673 bbase_y
= &bbase
[y
*bystride
];
1674 s
= (GFC_INTEGER_2
) 0;
1675 for (n
= 0; n
< count
; n
++)
1676 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1677 dest
[y
*rystride
] = s
;
1681 else if (axstride
< aystride
)
1683 for (y
= 0; y
< ycount
; y
++)
1684 for (x
= 0; x
< xcount
; x
++)
1685 dest
[x
*rxstride
+ y
*rystride
] = (GFC_INTEGER_2
)0;
1687 for (y
= 0; y
< ycount
; y
++)
1688 for (n
= 0; n
< count
; n
++)
1689 for (x
= 0; x
< xcount
; x
++)
1690 /* dest[x,y] += a[x,n] * b[n,y] */
1691 dest
[x
*rxstride
+ y
*rystride
] +=
1692 abase
[x
*axstride
+ n
*aystride
] *
1693 bbase
[n
*bxstride
+ y
*bystride
];
1695 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1697 const GFC_INTEGER_2
*restrict bbase_y
;
1700 for (y
= 0; y
< ycount
; y
++)
1702 bbase_y
= &bbase
[y
*bystride
];
1703 s
= (GFC_INTEGER_2
) 0;
1704 for (n
= 0; n
< count
; n
++)
1705 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1706 dest
[y
*rxstride
] = s
;
1711 const GFC_INTEGER_2
*restrict abase_x
;
1712 const GFC_INTEGER_2
*restrict bbase_y
;
1713 GFC_INTEGER_2
*restrict dest_y
;
1716 for (y
= 0; y
< ycount
; y
++)
1718 bbase_y
= &bbase
[y
*bystride
];
1719 dest_y
= &dest
[y
*rystride
];
1720 for (x
= 0; x
< xcount
; x
++)
1722 abase_x
= &abase
[x
*axstride
];
1723 s
= (GFC_INTEGER_2
) 0;
1724 for (n
= 0; n
< count
; n
++)
1725 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1726 dest_y
[x
*rxstride
] = s
;
1735 #endif /* HAVE_AVX512F */
1737 /* Function to fall back to if there is no special processor-specific version. */
1739 matmul_i2_vanilla (gfc_array_i2
* const restrict retarray
,
1740 gfc_array_i2
* const restrict a
, gfc_array_i2
* const restrict b
, int try_blas
,
1741 int blas_limit
, blas_call gemm
)
1743 const GFC_INTEGER_2
* restrict abase
;
1744 const GFC_INTEGER_2
* restrict bbase
;
1745 GFC_INTEGER_2
* restrict dest
;
1747 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1748 index_type x
, y
, n
, count
, xcount
, ycount
;
1750 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1751 || GFC_DESCRIPTOR_RANK (b
) == 2);
1753 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1755 Either A or B (but not both) can be rank 1:
1757 o One-dimensional argument A is implicitly treated as a row matrix
1758 dimensioned [1,count], so xcount=1.
1760 o One-dimensional argument B is implicitly treated as a column matrix
1761 dimensioned [count, 1], so ycount=1.
1764 if (retarray
->base_addr
== NULL
)
1766 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1768 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1769 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1771 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1773 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1774 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1778 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1779 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1781 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1782 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1783 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1787 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_INTEGER_2
));
1788 retarray
->offset
= 0;
1790 else if (unlikely (compile_options
.bounds_check
))
1792 index_type ret_extent
, arg_extent
;
1794 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1796 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1797 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1798 if (arg_extent
!= ret_extent
)
1799 runtime_error ("Incorrect extent in return array in"
1800 " MATMUL intrinsic: is %ld, should be %ld",
1801 (long int) ret_extent
, (long int) arg_extent
);
1803 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1805 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1806 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1807 if (arg_extent
!= ret_extent
)
1808 runtime_error ("Incorrect extent in return array in"
1809 " MATMUL intrinsic: is %ld, should be %ld",
1810 (long int) ret_extent
, (long int) arg_extent
);
1814 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1815 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1816 if (arg_extent
!= ret_extent
)
1817 runtime_error ("Incorrect extent in return array in"
1818 " MATMUL intrinsic for dimension 1:"
1819 " is %ld, should be %ld",
1820 (long int) ret_extent
, (long int) arg_extent
);
1822 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1823 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1824 if (arg_extent
!= ret_extent
)
1825 runtime_error ("Incorrect extent in return array in"
1826 " MATMUL intrinsic for dimension 2:"
1827 " is %ld, should be %ld",
1828 (long int) ret_extent
, (long int) arg_extent
);
1833 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
1835 /* One-dimensional result may be addressed in the code below
1836 either as a row or a column matrix. We want both cases to
1838 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1842 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1843 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1847 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1849 /* Treat it as a a row matrix A[1,count]. */
1850 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1854 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1858 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1859 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1861 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1862 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1865 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1867 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1868 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1871 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1873 /* Treat it as a column matrix B[count,1] */
1874 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1876 /* bystride should never be used for 1-dimensional b.
1877 in case it is we want it to cause a segfault, rather than
1878 an incorrect result. */
1879 bystride
= 0xDEADBEEF;
1884 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1885 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1886 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1889 abase
= a
->base_addr
;
1890 bbase
= b
->base_addr
;
1891 dest
= retarray
->base_addr
;
1893 /* Now that everything is set up, we perform the multiplication
1896 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1897 #define min(a,b) ((a) <= (b) ? (a) : (b))
1898 #define max(a,b) ((a) >= (b) ? (a) : (b))
1900 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1901 && (bxstride
== 1 || bystride
== 1)
1902 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1903 > POW3(blas_limit
)))
1905 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1906 const GFC_INTEGER_2 one
= 1, zero
= 0;
1907 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1908 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1910 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1912 assert (gemm
!= NULL
);
1913 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1914 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1920 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1922 /* This block of code implements a tuned matmul, derived from
1923 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1925 Bo Kagstrom and Per Ling
1926 Department of Computing Science
1928 S-901 87 Umea, Sweden
1930 from netlib.org, translated to C, and modified for matmul.m4. */
1932 const GFC_INTEGER_2
*a
, *b
;
1934 const index_type m
= xcount
, n
= ycount
, k
= count
;
1936 /* System generated locals */
1937 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1938 i1
, i2
, i3
, i4
, i5
, i6
;
1940 /* Local variables */
1941 GFC_INTEGER_2 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1942 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1943 index_type i
, j
, l
, ii
, jj
, ll
;
1944 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1949 c
= retarray
->base_addr
;
1951 /* Parameter adjustments */
1953 c_offset
= 1 + c_dim1
;
1956 a_offset
= 1 + a_dim1
;
1959 b_offset
= 1 + b_dim1
;
1962 /* Early exit if possible */
1963 if (m
== 0 || n
== 0 || k
== 0)
1966 /* Adjust size of t1 to what is needed. */
1968 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
1972 t1
= malloc (t1_dim
* sizeof(GFC_INTEGER_2
));
1974 /* Empty c first. */
1975 for (j
=1; j
<=n
; j
++)
1976 for (i
=1; i
<=m
; i
++)
1977 c
[i
+ j
* c_dim1
] = (GFC_INTEGER_2
)0;
1979 /* Start turning the crank. */
1981 for (jj
= 1; jj
<= i1
; jj
+= 512)
1987 ujsec
= jsec
- jsec
% 4;
1989 for (ll
= 1; ll
<= i2
; ll
+= 256)
1995 ulsec
= lsec
- lsec
% 2;
1998 for (ii
= 1; ii
<= i3
; ii
+= 256)
2004 uisec
= isec
- isec
% 2;
2005 i4
= ll
+ ulsec
- 1;
2006 for (l
= ll
; l
<= i4
; l
+= 2)
2008 i5
= ii
+ uisec
- 1;
2009 for (i
= ii
; i
<= i5
; i
+= 2)
2011 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
2013 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
2014 a
[i
+ (l
+ 1) * a_dim1
];
2015 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
2016 a
[i
+ 1 + l
* a_dim1
];
2017 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
2018 a
[i
+ 1 + (l
+ 1) * a_dim1
];
2022 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
2023 a
[ii
+ isec
- 1 + l
* a_dim1
];
2024 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
2025 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2031 for (i
= ii
; i
<= i4
; ++i
)
2033 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2034 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2038 uisec
= isec
- isec
% 4;
2039 i4
= jj
+ ujsec
- 1;
2040 for (j
= jj
; j
<= i4
; j
+= 4)
2042 i5
= ii
+ uisec
- 1;
2043 for (i
= ii
; i
<= i5
; i
+= 4)
2045 f11
= c
[i
+ j
* c_dim1
];
2046 f21
= c
[i
+ 1 + j
* c_dim1
];
2047 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2048 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2049 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2050 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2051 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2052 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2053 f31
= c
[i
+ 2 + j
* c_dim1
];
2054 f41
= c
[i
+ 3 + j
* c_dim1
];
2055 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2056 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2057 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2058 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2059 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2060 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2062 for (l
= ll
; l
<= i6
; ++l
)
2064 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2065 * b
[l
+ j
* b_dim1
];
2066 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2067 * b
[l
+ j
* b_dim1
];
2068 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2069 * b
[l
+ (j
+ 1) * b_dim1
];
2070 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2071 * b
[l
+ (j
+ 1) * b_dim1
];
2072 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2073 * b
[l
+ (j
+ 2) * b_dim1
];
2074 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2075 * b
[l
+ (j
+ 2) * b_dim1
];
2076 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2077 * b
[l
+ (j
+ 3) * b_dim1
];
2078 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2079 * b
[l
+ (j
+ 3) * b_dim1
];
2080 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2081 * b
[l
+ j
* b_dim1
];
2082 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2083 * b
[l
+ j
* b_dim1
];
2084 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2085 * b
[l
+ (j
+ 1) * b_dim1
];
2086 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2087 * b
[l
+ (j
+ 1) * b_dim1
];
2088 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2089 * b
[l
+ (j
+ 2) * b_dim1
];
2090 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2091 * b
[l
+ (j
+ 2) * b_dim1
];
2092 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2093 * b
[l
+ (j
+ 3) * b_dim1
];
2094 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2095 * b
[l
+ (j
+ 3) * b_dim1
];
2097 c
[i
+ j
* c_dim1
] = f11
;
2098 c
[i
+ 1 + j
* c_dim1
] = f21
;
2099 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2100 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2101 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2102 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2103 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2104 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2105 c
[i
+ 2 + j
* c_dim1
] = f31
;
2106 c
[i
+ 3 + j
* c_dim1
] = f41
;
2107 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2108 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2109 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2110 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2111 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2112 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2117 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2119 f11
= c
[i
+ j
* c_dim1
];
2120 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2121 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2122 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2124 for (l
= ll
; l
<= i6
; ++l
)
2126 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2127 257] * b
[l
+ j
* b_dim1
];
2128 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2129 257] * b
[l
+ (j
+ 1) * b_dim1
];
2130 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2131 257] * b
[l
+ (j
+ 2) * b_dim1
];
2132 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2133 257] * b
[l
+ (j
+ 3) * b_dim1
];
2135 c
[i
+ j
* c_dim1
] = f11
;
2136 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2137 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2138 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2145 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2147 i5
= ii
+ uisec
- 1;
2148 for (i
= ii
; i
<= i5
; i
+= 4)
2150 f11
= c
[i
+ j
* c_dim1
];
2151 f21
= c
[i
+ 1 + j
* c_dim1
];
2152 f31
= c
[i
+ 2 + j
* c_dim1
];
2153 f41
= c
[i
+ 3 + j
* c_dim1
];
2155 for (l
= ll
; l
<= i6
; ++l
)
2157 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2158 257] * b
[l
+ j
* b_dim1
];
2159 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2160 257] * b
[l
+ j
* b_dim1
];
2161 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2162 257] * b
[l
+ j
* b_dim1
];
2163 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2164 257] * b
[l
+ j
* b_dim1
];
2166 c
[i
+ j
* c_dim1
] = f11
;
2167 c
[i
+ 1 + j
* c_dim1
] = f21
;
2168 c
[i
+ 2 + j
* c_dim1
] = f31
;
2169 c
[i
+ 3 + j
* c_dim1
] = f41
;
2172 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2174 f11
= c
[i
+ j
* c_dim1
];
2176 for (l
= ll
; l
<= i6
; ++l
)
2178 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2179 257] * b
[l
+ j
* b_dim1
];
2181 c
[i
+ j
* c_dim1
] = f11
;
2191 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
2193 if (GFC_DESCRIPTOR_RANK (a
) != 1)
2195 const GFC_INTEGER_2
*restrict abase_x
;
2196 const GFC_INTEGER_2
*restrict bbase_y
;
2197 GFC_INTEGER_2
*restrict dest_y
;
2200 for (y
= 0; y
< ycount
; y
++)
2202 bbase_y
= &bbase
[y
*bystride
];
2203 dest_y
= &dest
[y
*rystride
];
2204 for (x
= 0; x
< xcount
; x
++)
2206 abase_x
= &abase
[x
*axstride
];
2207 s
= (GFC_INTEGER_2
) 0;
2208 for (n
= 0; n
< count
; n
++)
2209 s
+= abase_x
[n
] * bbase_y
[n
];
2216 const GFC_INTEGER_2
*restrict bbase_y
;
2219 for (y
= 0; y
< ycount
; y
++)
2221 bbase_y
= &bbase
[y
*bystride
];
2222 s
= (GFC_INTEGER_2
) 0;
2223 for (n
= 0; n
< count
; n
++)
2224 s
+= abase
[n
*axstride
] * bbase_y
[n
];
2225 dest
[y
*rystride
] = s
;
2229 else if (axstride
< aystride
)
2231 for (y
= 0; y
< ycount
; y
++)
2232 for (x
= 0; x
< xcount
; x
++)
2233 dest
[x
*rxstride
+ y
*rystride
] = (GFC_INTEGER_2
)0;
2235 for (y
= 0; y
< ycount
; y
++)
2236 for (n
= 0; n
< count
; n
++)
2237 for (x
= 0; x
< xcount
; x
++)
2238 /* dest[x,y] += a[x,n] * b[n,y] */
2239 dest
[x
*rxstride
+ y
*rystride
] +=
2240 abase
[x
*axstride
+ n
*aystride
] *
2241 bbase
[n
*bxstride
+ y
*bystride
];
2243 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
2245 const GFC_INTEGER_2
*restrict bbase_y
;
2248 for (y
= 0; y
< ycount
; y
++)
2250 bbase_y
= &bbase
[y
*bystride
];
2251 s
= (GFC_INTEGER_2
) 0;
2252 for (n
= 0; n
< count
; n
++)
2253 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
2254 dest
[y
*rxstride
] = s
;
2259 const GFC_INTEGER_2
*restrict abase_x
;
2260 const GFC_INTEGER_2
*restrict bbase_y
;
2261 GFC_INTEGER_2
*restrict dest_y
;
2264 for (y
= 0; y
< ycount
; y
++)
2266 bbase_y
= &bbase
[y
*bystride
];
2267 dest_y
= &dest
[y
*rystride
];
2268 for (x
= 0; x
< xcount
; x
++)
2270 abase_x
= &abase
[x
*axstride
];
2271 s
= (GFC_INTEGER_2
) 0;
2272 for (n
= 0; n
< count
; n
++)
2273 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
2274 dest_y
[x
*rxstride
] = s
;
2284 /* Compiling main function, with selection code for the processor. */
2286 /* Currently, this is i386 only. Adjust for other architectures. */
2288 #include <config/i386/cpuinfo.h>
2289 void matmul_i2 (gfc_array_i2
* const restrict retarray
,
2290 gfc_array_i2
* const restrict a
, gfc_array_i2
* const restrict b
, int try_blas
,
2291 int blas_limit
, blas_call gemm
)
2293 static void (*matmul_p
) (gfc_array_i2
* const restrict retarray
,
2294 gfc_array_i2
* const restrict a
, gfc_array_i2
* const restrict b
, int try_blas
,
2295 int blas_limit
, blas_call gemm
);
2297 void (*matmul_fn
) (gfc_array_i2
* const restrict retarray
,
2298 gfc_array_i2
* const restrict a
, gfc_array_i2
* const restrict b
, int try_blas
,
2299 int blas_limit
, blas_call gemm
);
2301 matmul_fn
= __atomic_load_n (&matmul_p
, __ATOMIC_RELAXED
);
2302 if (matmul_fn
== NULL
)
2304 matmul_fn
= matmul_i2_vanilla
;
2305 if (__cpu_model
.__cpu_vendor
== VENDOR_INTEL
)
2307 /* Run down the available processors in order of preference. */
2309 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX512F
))
2311 matmul_fn
= matmul_i2_avx512f
;
2315 #endif /* HAVE_AVX512F */
2318 if ((__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX2
))
2319 && (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_FMA
)))
2321 matmul_fn
= matmul_i2_avx2
;
2328 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX
))
2330 matmul_fn
= matmul_i2_avx
;
2333 #endif /* HAVE_AVX */
2336 __atomic_store_n (&matmul_p
, matmul_fn
, __ATOMIC_RELAXED
);
2339 (*matmul_fn
) (retarray
, a
, b
, try_blas
, blas_limit
, gemm
);
2342 #else /* Just the vanilla function. */
2345 matmul_i2 (gfc_array_i2
* const restrict retarray
,
2346 gfc_array_i2
* const restrict a
, gfc_array_i2
* const restrict b
, int try_blas
,
2347 int blas_limit
, blas_call gemm
)
2349 const GFC_INTEGER_2
* restrict abase
;
2350 const GFC_INTEGER_2
* restrict bbase
;
2351 GFC_INTEGER_2
* restrict dest
;
2353 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
2354 index_type x
, y
, n
, count
, xcount
, ycount
;
2356 assert (GFC_DESCRIPTOR_RANK (a
) == 2
2357 || GFC_DESCRIPTOR_RANK (b
) == 2);
2359 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2361 Either A or B (but not both) can be rank 1:
2363 o One-dimensional argument A is implicitly treated as a row matrix
2364 dimensioned [1,count], so xcount=1.
2366 o One-dimensional argument B is implicitly treated as a column matrix
2367 dimensioned [count, 1], so ycount=1.
2370 if (retarray
->base_addr
== NULL
)
2372 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2374 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2375 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
2377 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
2379 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2380 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
2384 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2385 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
2387 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
2388 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
2389 GFC_DESCRIPTOR_EXTENT(retarray
,0));
2393 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_INTEGER_2
));
2394 retarray
->offset
= 0;
2396 else if (unlikely (compile_options
.bounds_check
))
2398 index_type ret_extent
, arg_extent
;
2400 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2402 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
2403 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2404 if (arg_extent
!= ret_extent
)
2405 runtime_error ("Incorrect extent in return array in"
2406 " MATMUL intrinsic: is %ld, should be %ld",
2407 (long int) ret_extent
, (long int) arg_extent
);
2409 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
2411 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
2412 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2413 if (arg_extent
!= ret_extent
)
2414 runtime_error ("Incorrect extent in return array in"
2415 " MATMUL intrinsic: is %ld, should be %ld",
2416 (long int) ret_extent
, (long int) arg_extent
);
2420 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
2421 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2422 if (arg_extent
!= ret_extent
)
2423 runtime_error ("Incorrect extent in return array in"
2424 " MATMUL intrinsic for dimension 1:"
2425 " is %ld, should be %ld",
2426 (long int) ret_extent
, (long int) arg_extent
);
2428 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
2429 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
2430 if (arg_extent
!= ret_extent
)
2431 runtime_error ("Incorrect extent in return array in"
2432 " MATMUL intrinsic for dimension 2:"
2433 " is %ld, should be %ld",
2434 (long int) ret_extent
, (long int) arg_extent
);
2439 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
2441 /* One-dimensional result may be addressed in the code below
2442 either as a row or a column matrix. We want both cases to
2444 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
2448 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
2449 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
2453 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2455 /* Treat it as a a row matrix A[1,count]. */
2456 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
2460 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
2464 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
2465 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
2467 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
2468 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
2471 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
2473 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
2474 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2477 if (GFC_DESCRIPTOR_RANK (b
) == 1)
2479 /* Treat it as a column matrix B[count,1] */
2480 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
2482 /* bystride should never be used for 1-dimensional b.
2483 in case it is we want it to cause a segfault, rather than
2484 an incorrect result. */
2485 bystride
= 0xDEADBEEF;
2490 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
2491 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
2492 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
2495 abase
= a
->base_addr
;
2496 bbase
= b
->base_addr
;
2497 dest
= retarray
->base_addr
;
2499 /* Now that everything is set up, we perform the multiplication
2502 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2503 #define min(a,b) ((a) <= (b) ? (a) : (b))
2504 #define max(a,b) ((a) >= (b) ? (a) : (b))
2506 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
2507 && (bxstride
== 1 || bystride
== 1)
2508 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
2509 > POW3(blas_limit
)))
2511 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
2512 const GFC_INTEGER_2 one
= 1, zero
= 0;
2513 const int lda
= (axstride
== 1) ? aystride
: axstride
,
2514 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
2516 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
2518 assert (gemm
!= NULL
);
2519 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
2520 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
2526 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
2528 /* This block of code implements a tuned matmul, derived from
2529 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2531 Bo Kagstrom and Per Ling
2532 Department of Computing Science
2534 S-901 87 Umea, Sweden
2536 from netlib.org, translated to C, and modified for matmul.m4. */
2538 const GFC_INTEGER_2
*a
, *b
;
2540 const index_type m
= xcount
, n
= ycount
, k
= count
;
2542 /* System generated locals */
2543 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
2544 i1
, i2
, i3
, i4
, i5
, i6
;
2546 /* Local variables */
2547 GFC_INTEGER_2 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
2548 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
2549 index_type i
, j
, l
, ii
, jj
, ll
;
2550 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
2555 c
= retarray
->base_addr
;
2557 /* Parameter adjustments */
2559 c_offset
= 1 + c_dim1
;
2562 a_offset
= 1 + a_dim1
;
2565 b_offset
= 1 + b_dim1
;
2568 /* Early exit if possible */
2569 if (m
== 0 || n
== 0 || k
== 0)
2572 /* Adjust size of t1 to what is needed. */
2574 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
2578 t1
= malloc (t1_dim
* sizeof(GFC_INTEGER_2
));
2580 /* Empty c first. */
2581 for (j
=1; j
<=n
; j
++)
2582 for (i
=1; i
<=m
; i
++)
2583 c
[i
+ j
* c_dim1
] = (GFC_INTEGER_2
)0;
2585 /* Start turning the crank. */
2587 for (jj
= 1; jj
<= i1
; jj
+= 512)
2593 ujsec
= jsec
- jsec
% 4;
2595 for (ll
= 1; ll
<= i2
; ll
+= 256)
2601 ulsec
= lsec
- lsec
% 2;
2604 for (ii
= 1; ii
<= i3
; ii
+= 256)
2610 uisec
= isec
- isec
% 2;
2611 i4
= ll
+ ulsec
- 1;
2612 for (l
= ll
; l
<= i4
; l
+= 2)
2614 i5
= ii
+ uisec
- 1;
2615 for (i
= ii
; i
<= i5
; i
+= 2)
2617 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
2619 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
2620 a
[i
+ (l
+ 1) * a_dim1
];
2621 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
2622 a
[i
+ 1 + l
* a_dim1
];
2623 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
2624 a
[i
+ 1 + (l
+ 1) * a_dim1
];
2628 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
2629 a
[ii
+ isec
- 1 + l
* a_dim1
];
2630 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
2631 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2637 for (i
= ii
; i
<= i4
; ++i
)
2639 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2640 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2644 uisec
= isec
- isec
% 4;
2645 i4
= jj
+ ujsec
- 1;
2646 for (j
= jj
; j
<= i4
; j
+= 4)
2648 i5
= ii
+ uisec
- 1;
2649 for (i
= ii
; i
<= i5
; i
+= 4)
2651 f11
= c
[i
+ j
* c_dim1
];
2652 f21
= c
[i
+ 1 + j
* c_dim1
];
2653 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2654 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2655 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2656 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2657 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2658 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2659 f31
= c
[i
+ 2 + j
* c_dim1
];
2660 f41
= c
[i
+ 3 + j
* c_dim1
];
2661 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2662 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2663 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2664 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2665 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2666 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2668 for (l
= ll
; l
<= i6
; ++l
)
2670 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2671 * b
[l
+ j
* b_dim1
];
2672 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2673 * b
[l
+ j
* b_dim1
];
2674 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2675 * b
[l
+ (j
+ 1) * b_dim1
];
2676 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2677 * b
[l
+ (j
+ 1) * b_dim1
];
2678 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2679 * b
[l
+ (j
+ 2) * b_dim1
];
2680 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2681 * b
[l
+ (j
+ 2) * b_dim1
];
2682 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2683 * b
[l
+ (j
+ 3) * b_dim1
];
2684 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2685 * b
[l
+ (j
+ 3) * b_dim1
];
2686 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2687 * b
[l
+ j
* b_dim1
];
2688 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2689 * b
[l
+ j
* b_dim1
];
2690 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2691 * b
[l
+ (j
+ 1) * b_dim1
];
2692 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2693 * b
[l
+ (j
+ 1) * b_dim1
];
2694 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2695 * b
[l
+ (j
+ 2) * b_dim1
];
2696 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2697 * b
[l
+ (j
+ 2) * b_dim1
];
2698 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2699 * b
[l
+ (j
+ 3) * b_dim1
];
2700 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2701 * b
[l
+ (j
+ 3) * b_dim1
];
2703 c
[i
+ j
* c_dim1
] = f11
;
2704 c
[i
+ 1 + j
* c_dim1
] = f21
;
2705 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2706 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2707 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2708 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2709 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2710 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2711 c
[i
+ 2 + j
* c_dim1
] = f31
;
2712 c
[i
+ 3 + j
* c_dim1
] = f41
;
2713 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2714 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2715 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2716 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2717 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2718 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2723 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2725 f11
= c
[i
+ j
* c_dim1
];
2726 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2727 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2728 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2730 for (l
= ll
; l
<= i6
; ++l
)
2732 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2733 257] * b
[l
+ j
* b_dim1
];
2734 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2735 257] * b
[l
+ (j
+ 1) * b_dim1
];
2736 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2737 257] * b
[l
+ (j
+ 2) * b_dim1
];
2738 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2739 257] * b
[l
+ (j
+ 3) * b_dim1
];
2741 c
[i
+ j
* c_dim1
] = f11
;
2742 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2743 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2744 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2751 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2753 i5
= ii
+ uisec
- 1;
2754 for (i
= ii
; i
<= i5
; i
+= 4)
2756 f11
= c
[i
+ j
* c_dim1
];
2757 f21
= c
[i
+ 1 + j
* c_dim1
];
2758 f31
= c
[i
+ 2 + j
* c_dim1
];
2759 f41
= c
[i
+ 3 + j
* c_dim1
];
2761 for (l
= ll
; l
<= i6
; ++l
)
2763 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2764 257] * b
[l
+ j
* b_dim1
];
2765 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2766 257] * b
[l
+ j
* b_dim1
];
2767 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2768 257] * b
[l
+ j
* b_dim1
];
2769 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2770 257] * b
[l
+ j
* b_dim1
];
2772 c
[i
+ j
* c_dim1
] = f11
;
2773 c
[i
+ 1 + j
* c_dim1
] = f21
;
2774 c
[i
+ 2 + j
* c_dim1
] = f31
;
2775 c
[i
+ 3 + j
* c_dim1
] = f41
;
2778 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2780 f11
= c
[i
+ j
* c_dim1
];
2782 for (l
= ll
; l
<= i6
; ++l
)
2784 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2785 257] * b
[l
+ j
* b_dim1
];
2787 c
[i
+ j
* c_dim1
] = f11
;
2797 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
2799 if (GFC_DESCRIPTOR_RANK (a
) != 1)
2801 const GFC_INTEGER_2
*restrict abase_x
;
2802 const GFC_INTEGER_2
*restrict bbase_y
;
2803 GFC_INTEGER_2
*restrict dest_y
;
2806 for (y
= 0; y
< ycount
; y
++)
2808 bbase_y
= &bbase
[y
*bystride
];
2809 dest_y
= &dest
[y
*rystride
];
2810 for (x
= 0; x
< xcount
; x
++)
2812 abase_x
= &abase
[x
*axstride
];
2813 s
= (GFC_INTEGER_2
) 0;
2814 for (n
= 0; n
< count
; n
++)
2815 s
+= abase_x
[n
] * bbase_y
[n
];
2822 const GFC_INTEGER_2
*restrict bbase_y
;
2825 for (y
= 0; y
< ycount
; y
++)
2827 bbase_y
= &bbase
[y
*bystride
];
2828 s
= (GFC_INTEGER_2
) 0;
2829 for (n
= 0; n
< count
; n
++)
2830 s
+= abase
[n
*axstride
] * bbase_y
[n
];
2831 dest
[y
*rystride
] = s
;
2835 else if (axstride
< aystride
)
2837 for (y
= 0; y
< ycount
; y
++)
2838 for (x
= 0; x
< xcount
; x
++)
2839 dest
[x
*rxstride
+ y
*rystride
] = (GFC_INTEGER_2
)0;
2841 for (y
= 0; y
< ycount
; y
++)
2842 for (n
= 0; n
< count
; n
++)
2843 for (x
= 0; x
< xcount
; x
++)
2844 /* dest[x,y] += a[x,n] * b[n,y] */
2845 dest
[x
*rxstride
+ y
*rystride
] +=
2846 abase
[x
*axstride
+ n
*aystride
] *
2847 bbase
[n
*bxstride
+ y
*bystride
];
2849 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
2851 const GFC_INTEGER_2
*restrict bbase_y
;
2854 for (y
= 0; y
< ycount
; y
++)
2856 bbase_y
= &bbase
[y
*bystride
];
2857 s
= (GFC_INTEGER_2
) 0;
2858 for (n
= 0; n
< count
; n
++)
2859 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
2860 dest
[y
*rxstride
] = s
;
2865 const GFC_INTEGER_2
*restrict abase_x
;
2866 const GFC_INTEGER_2
*restrict bbase_y
;
2867 GFC_INTEGER_2
*restrict dest_y
;
2870 for (y
= 0; y
< ycount
; y
++)
2872 bbase_y
= &bbase
[y
*bystride
];
2873 dest_y
= &dest
[y
*rystride
];
2874 for (x
= 0; x
< xcount
; x
++)
2876 abase_x
= &abase
[x
*axstride
];
2877 s
= (GFC_INTEGER_2
) 0;
2878 for (n
= 0; n
< count
; n
++)
2879 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
2880 dest_y
[x
*rxstride
] = s
;