计算矩阵乘法的函数之一是 cblas_sgemm,使用单精度实数,另外还有对应双精度实数,单精度复数和双精度复数的函数。在此以 cblas_sgemm为例。
函数定义为:
void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
const int K, const float alpha, const float *A,
const int lda, const float *B, const int ldb,
const float beta, float *C, const int ldc)
得到的结果是:
C = alpha*op( A )*op( B ) + beta*C
const enum CBLAS_ORDER Order,这是指的数据的存储形式,在CBLAS的函数中无论一维还是二维数据都是用一维数组存储,这就要涉及是行主序还是列主序,在C语言中数组是用 行主序,fortran中是列主序。我还是习惯于是用行主序,所以这个参数是用CblasRowMajor,如果是列主序的话就是 CblasColMajor。
const int M,矩阵A的行,矩阵C的行
const int N,矩阵B的列,矩阵C的列
const int K,矩阵A的列,矩阵B的行
const float alpha, const float beta,计算公式中的两个参数值,如果只是计算C=A*B,则alpha=1,beta=0
const float *A, const float *B, const float *C,矩阵ABC的数据
const int lda, const int ldb, const int ldc,在BLAS的文档里,这三个参数分别为ABC的行数,但是实际使用发现,在CBLAS里应该是列数。
我在这里计算两个简单矩阵的乘法。
A:
1,2,3
4,5,6
7,8,9
8,7,6
B:
5,4
3,2
1,0
程序代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 //因为程序是C++,而CBLAS是C语言写的,所以在此处用extern关键字 extern "C" { #include } #include using namespace std; int main(void) { const enum CBLAS_ORDER Order=CblasRowMajor; const enum CBLAS_TRANSPOSE TransA=CblasNoTrans; const enum CBLAS_TRANSPOSE TransB=CblasNoTrans; const int M=4;//A的行数,C的行数 const int N=2;//B的列数,C的列数 const int K=3;//A的列数,B的行数 const float alpha=1; const float beta=0; const int lda=K;//A的列 const int ldb=N;//B的列 const int ldc=N;//C的列 const float A[K*M]={1,2,3,4,5,6,7,8,9,8,7,6}; const float B[K*N]={5,4,3,2,1,0}; float C[M*N]; cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); for(int i=0;i<</FONT>M;i++) { for(int j=0;j<</FONT>N;j++) { cout<<C[i*N+j]<<"\t"; } cout<<endl; } return EXIT_SUCCESS; }
#include <stdio.h> #include <string.h> #include <ctype.h> #include <stdarg.h> #include <math.h> #include "mkl_cblas.h" //#include <stdlib.h> #include "mkl.h" #define MAX_GRP_COUNT 5 int main() { float *A, *B, *C; int m, n, k, i, j; float alpha, beta; printf("\n This example computes real matrix C=alpha*A*B+beta*C using \n" " Intel(R) MKL function dgemm, where A, B, and C are matrices and \n" " alpha and beta are double precision scalars\n\n"); //m = 2000, k = 200, n = 1000; m = 2, k = 4, n = 3; printf(" Initializing data for matrix multiplication C=A*B for matrix \n" " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n); alpha = 1.0; beta = 0.0; printf(" Allocating memory for matrices aligned on 64-byte boundary for better \n" " performance \n\n"); A = (float *)mkl_malloc(m*k*sizeof(float), 64); B = (float *)mkl_malloc(k*n*sizeof(float), 64); C = (float *)mkl_malloc(m*n*sizeof(float), 64); if (A == NULL || B == NULL || C == NULL) { printf("\n ERROR: Can't allocate memory for matrices. Aborting... \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } printf(" Intializing matrix data \n\n"); for (i = 0; i < (m*k); i++) { A[i] = (float)(i + 1); } for (i = 0; i < (k*n); i++) { B[i] = (float)(-i - 1); } for (i = 0; i < (m*n); i++) { C[i] = 0.0; } printf(" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n"); cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, alpha, A, k, B, k, beta, C, n); printf("\n Computations completed.\n\n"); printf(" Top left corner of matrix A: \n"); for (i = 0; i<min(m, 6); i++) { for (j = 0; j<min(k, 6); j++) { printf(".0f", A[j + i*k]); } printf("\n"); } printf("\n Top left corner of matrix B: \n"); for (i = 0; i<min(k, 6); i++) { for (j = 0; j<min(n, 6); j++) { printf(".0f", B[j + i*n]); } printf("\n"); } printf("\n Top left corner of matrix C: \n"); for (i = 0; i<min(m, 6); i++) { for (j = 0; j<min(n, 6); j++) { printf(".5G", C[j + i*n]); } printf("\n"); } printf("\n Deallocating memory \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); printf(" Example completed. \n\n"); return 0; }