I wanted to use my own implementation of the Strassen optimization for matrix multiplication. It is highly optimized and hence has almost no pedagogical use but then I remembered that Wikipedia has actual C code in the Strassen-matrix-multiplication entry.
The implementation there is not the best:
- it has no fallback to the naive algorithm if the size falls below a certain limit
- only matrix sizes are allowed that are a power of two
- it accumulates a lot of error if used with finite floating point data types
- it uses a lot of memory, more than necessary
- it needs some work to make it parallelizable
But it works, so …
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MM_OK 1
#define MM_ERR 0
void mat_print(double **M, int dim);
double **mat_init(int dim);
void mat_clear(double **M, int dim);
double **mat_rand(int dim, int seed);
int mat_mul_naive(double **A, double **B, double **C, int dim);
int mat_add(double **A, double **B, double **C, int dim);
int mat_sub(double **A, double **B, double **C, int dim);
int mat_mul_strassen(double **A, double **B, double **C, int dim);
// ALL CHECKS OMMITTED!
// Octave/Matlab format, for getting a second opinion
void mat_print(double **M, int dim)
{
int i, j;
printf("[");
for (i = 0; i < dim; i++) {
for (j = 0; j < dim; j++) {
printf("%.10g", M[i][j]);
if (j < dim - 1) {
putc(',', stdout);
}
}
if (i < dim - 1) {
putc(';', stdout);
}
}
printf("]\n");
}
double **mat_init(int dim)
{
int i, j;
double **M;
M = malloc(dim * sizeof(double *));
if (M == NULL) {
return NULL;
}
for (i = 0; i < dim; i++) {
M[i] = malloc(dim * sizeof(double));
if (M[i] == NULL) {
mat_clear(M, i);
return NULL;
}
for (j = 0; j < dim; j++) {
M[i][j] = 0;
}
}
return M;
}
void mat_clear(double **M, int dim)
{
int i;
for (i = 0; i < dim; i++) {
free(M[i]);
}
free(M);
}
double **mat_rand(int dim, int seed)
{
int i, j;
double **M;
M = mat_init(dim);
if (M == NULL) {
return NULL;
}
srand(seed);
for (i = 0; i < dim; i++) {
for (j = 0; j < dim; j++) {
M[i][j] = (double) rand() / (double) rand();
}
}
return M;
}
int mat_mul_naive(double **A, double **B, double **C, int dim)
{
int i, j, k;
if (C == NULL) {
return MM_ERR;
}
for (i = 0; i < dim; i++) {
for (k = 0; k < dim; k++) { //<- exchanged for better
for (j = 0; j < dim; j++) { //<- memory access
C[i][j] += (A[i][k] * B[k][j]);
}
}
}
return MM_OK;
}
// helper for mat_mul_strassen, only square matrices of same size
int mat_add(double **A, double **B, double **C, int dim)
{
int i, j;
if (C == NULL) {
fprintf(stderr, "C == NULL in mat_add\n");
return MM_ERR;
}
for (i = 0; i < dim; i++) {
for (j = 0; j < dim; j++) {
C[i][j] = A[i][j] + B[i][j];
}
}
return MM_OK;
}
// helper for mat_mul_strassen, only square matrices of same size
int mat_sub(double **A, double **B, double **C, int dim)
{
int i, j;
if (C == NULL) {
fprintf(stderr, "C == NULL in mat_sub\n");
return MM_ERR;
}
for (i = 0; i < dim; i++) {
for (j = 0; j < dim; j++) {
C[i][j] = A[i][j] - B[i][j];
}
}
return MM_OK;
}
// This implementation is from Wikipedia
// https://en.wikipedia.org/w/index.php?title=Strassen_algorithm&oldid=498910018#Source_code_of_the_Strassen_algorithm_in_C_language
// And it is a really slow one.
int mat_mul_strassen(double **A, double **B, double **C, int dim)
{
if (dim == 1) {
C[0][0] = A[0][0] * B[0][0];
return MM_OK;
}
#ifdef MAKE_IT_ACTUALLY_WORK_FAST
// 256 on my machine, YMMV
else if (dim <= 256) {
return mat_mul_naive(A, B, C, dim);
}
#endif
else {
int new_dim = dim / 2;
double **a11, **a12, **a21, **a22;
double **b11, **b12, **b21, **b22;
double **c11, **c12, **c21, **c22;
double **p1, **p2, **p3, **p4, **p5, **p6, **p7;
double **aResult;
double **bResult;
int i, j;
a11 = mat_init(new_dim);
a12 = mat_init(new_dim);
a21 = mat_init(new_dim);
a22 = mat_init(new_dim);
b11 = mat_init(new_dim);
b12 = mat_init(new_dim);
b21 = mat_init(new_dim);
b22 = mat_init(new_dim);
c11 = mat_init(new_dim);
c12 = mat_init(new_dim);
c21 = mat_init(new_dim);
c22 = mat_init(new_dim);
p1 = mat_init(new_dim);
p2 = mat_init(new_dim);
p3 = mat_init(new_dim);
p4 = mat_init(new_dim);
p5 = mat_init(new_dim);
p6 = mat_init(new_dim);
p7 = mat_init(new_dim);
aResult = mat_init(new_dim);
bResult = mat_init(new_dim);
// the divide part of divide&conquer
for (i = 0; i < new_dim; i++) {
for (j = 0; j < new_dim; j++) {
a11[i][j] = A[i][j];
a12[i][j] = A[i][j + new_dim];
a21[i][j] = A[i + new_dim][j];
a22[i][j] = A[i + new_dim][j + new_dim];
b11[i][j] = B[i][j];
b12[i][j] = B[i][j + new_dim];
b21[i][j] = B[i + new_dim][j];
b22[i][j] = B[i + new_dim][j + new_dim];
}
}
// Calculating p1 to p7:
mat_add(a11, a22, aResult, new_dim); // a11 + a22
mat_add(b11, b22, bResult, new_dim); // b11 + b22
mat_mul_strassen(aResult, bResult, p1, new_dim); // p1 = (a11+a22) * (b11+b22)
mat_add(a21, a22, aResult, new_dim); // a21 + a22
mat_mul_strassen(aResult, b11, p2, new_dim); // p2 = (a21+a22) * (b11)
mat_sub(b12, b22, bResult, new_dim); // b12 - b22
mat_mul_strassen(a11, bResult, p3, new_dim); // p3 = (a11) * (b12 - b22)
mat_sub(b21, b11, bResult, new_dim); // b21 - b11
mat_mul_strassen(a22, bResult, p4, new_dim); // p4 = (a22) * (b21 - b11)
mat_add(a11, a12, aResult, new_dim); // a11 + a12
mat_mul_strassen(aResult, b22, p5, new_dim); // p5 = (a11+a12) * (b22)
mat_sub(a21, a11, aResult, new_dim); // a21 - a11
mat_add(b11, b12, bResult, new_dim); // b11 + b12
mat_mul_strassen(aResult, bResult, p6, new_dim); // p6 = (a21-a11) * (b11+b12)
mat_sub(a12, a22, aResult, new_dim); // a12 - a22
mat_add(b21, b22, bResult, new_dim); // b21 + b22
mat_mul_strassen(aResult, bResult, p7, new_dim); // p7 = (a12-a22) * (b21+b22)
// calculating c21, c21, c11 e c22:
mat_add(p3, p5, c12, new_dim); // c12 = p3 + p5
mat_add(p2, p4, c21, new_dim); // c21 = p2 + p4
mat_add(p1, p4, aResult, new_dim); // p1 + p4
mat_add(aResult, p7, bResult, new_dim); // p1 + p4 + p7
mat_sub(bResult, p5, c11, new_dim); // c11 = p1 + p4 - p5 + p7
mat_add(p1, p3, aResult, new_dim); // p1 + p3
mat_add(aResult, p6, bResult, new_dim); // p1 + p3 + p6
mat_sub(bResult, p2, c22, new_dim); // c22 = p1 + p3 - p2 + p6
// Grouping the results obtained in a single matrix:
for (i = 0; i < new_dim; i++) {
for (j = 0; j < new_dim; j++) {
C[i][j] = c11[i][j];
C[i][j + new_dim] = c12[i][j];
C[i + new_dim][j] = c21[i][j];
C[i + new_dim][j + new_dim] = c22[i][j];
}
}
mat_clear(a11, new_dim);
mat_clear(a12, new_dim);
mat_clear(a21, new_dim);
mat_clear(a22, new_dim);
mat_clear(b11, new_dim);
mat_clear(b12, new_dim);
mat_clear(b21, new_dim);
mat_clear(b22, new_dim);
mat_clear(c11, new_dim);
mat_clear(c12, new_dim);
mat_clear(c21, new_dim);
mat_clear(c22, new_dim);
mat_clear(p1, new_dim);
mat_clear(p2, new_dim);
mat_clear(p3, new_dim);
mat_clear(p4, new_dim);
mat_clear(p5, new_dim);
mat_clear(p6, new_dim);
mat_clear(p7, new_dim);
mat_clear(aResult, new_dim);
mat_clear(bResult, new_dim);
}
return MM_OK;
}
static int ispow2(unsigned int x)
{
while (((x & 1) == 0) && x > 1) {
x >>= 1;
}
if (x == 1) {
return 1;
}
return 0;
}
#include <time.h>
int main(int argc, char **argv)
{
//int res;
int dim1;
double **A, **B, **C, **D;
clock_t start, stop;
// only square matrices of same size
if (argc != 2) {
fprintf(stderr, "Usage: %s dimension\n", argv[0]);
exit(EXIT_FAILURE);
}
dim1 = atoi(argv[1]);
if (!ispow2(dim1)) {
fprintf(stderr,
"Dimension of matrix must be a power of two and %d is not\n", dim1);
exit(EXIT_FAILURE);
}
A = mat_rand(dim1, 123);
B = mat_rand(dim1, 456);
//mat_print(A, dim1);
//mat_print(B, dim1);
C = mat_init(dim1);
start = clock();
mat_mul_naive(A, B, C, dim1);
stop = clock();
printf("Naive %.10f seconds\n", (double) (stop - start) / CLOCKS_PER_SEC);
//mat_print(C, dim1);
D = mat_init(dim1);
start = clock();
mat_mul_strassen(A, B, D, dim1);
stop = clock();
printf("Strassen %.10f seconds\n", (double) (stop - start) / CLOCKS_PER_SEC);
//mat_print(D, dim1);
mat_sub(C, D, C, dim1);
// results will differ more and more the larger the matrix is
//mat_print(C, dim1);
mat_clear(A, dim1);
mat_clear(B, dim1);
mat_clear(C, dim1);
mat_clear(D, dim1);
exit(EXIT_SUCCESS);
}
solved How to multiply two matrices having fractions as inputs in c [closed]