Saturday, April 26, 2008

octave: speed optimization - fix matrix looping (080426)

with or without following commands on following octave codes will effect obviously different execution time. from with/without preallocated memory, changing looping sequence, vectorizing, until using specific function will produced different execution time. i have eight cases to be compared, they are:
  • case 1: without preallocated memory
for i = 1:N
m1(i, 1:P) = i * ones(1, P);
endfor
elapsed time is 39.3089 seconds.
  • case 2: preallocated memory with 'm = [];'
m2 = [];
for i = 1:N
m2(i, 1:P) = i * ones(1, P);
endfor
elapsed time is 38.7618 seconds.
  • case 3: preallocated memory with 'm(N, P) = 1;'
m3(N, P) = 1;
for i = 1:N
m3(i, 1:P) = i * ones(1, P);
endfor
elapsed time is 0.126078 second.
  • case 4: preallocated memory with 'm = ones(N, P);'
m4 = ones(N, P);
for i = 1:N
m4(i, 1:P) = i * ones (1, P);
endfor
elapsed time is 0.082219 second.
  • case 5: changing the looping sequence from 'for i = 1:N' to 'for i = N:-1:1'
thanks to ben abbott for his answer to my question related to this optimization. he also suggest me to compare with following code:
m5 = [];
for i = N:-1:1
m5(i, 1:P) = i * ones (1, P);
endfor
elapsed time is 0.130076 second.


thanks to przemek klosowski for following new cases (case 6 to 8) and idea to add the conclusion.
  • case 6: nested or more looping with preallocated memory
m6 = ones(N, P);
for i = N:-1:1
for j = P:-1:1
m6(i, j) = i;
endfor
endfor
elapsed time is 15.0053 seconds.
  • case 7: vectorizing using looping
m7 = ones(N, P);
for j = 1:P
m7(:,j) = 1:N;
endfor
elapsed time is 0.053149 second.
  • case 8: vectorizing using 'repmat' function
m8 = repmat((1:N)', 1, P);
elapsed time is 0.045707 second.
  • conclusion
  1. from case 1-4: one way to speed up looping execution is with preallocated memory.
  2. from case 5: another way to speed up looping execution is with changing the looping sequence.
  3. from case 6: looping is very time consuming. so, avoid of using looping with other commands. this can be different from case to case.
  4. from case 7: another way to speed up looping execution is with vectorizing. vectorizing can be faster than preallocated memory or changing the looping sequence.
  5. from case 8: another way to speed up looping execution is using specific function. using specific function can be faster than vectorizing, preallocated memory, or changing the looping sequence.
  • overall test code
# Copyright (C) 2008 by lain.ux
# l411v.ux@gmail.com
# www.l411v.com
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

# ------------------------------------------------[ define matrix size ]--
N = 900;
P = 1800;
printf("matrix size: m(%d,%d)\n", N, P);

# -------------------------------------------[ display result function ]--
function display_result(m, N, P)
printf("result:\n", N, P);

# checking first column (expected: 1, 1, 1, ..., 1)
printf("m[1,1] = %d\t", m(1,1));
printf("m[1,2] = %d\t", m(1,2));
printf("m[1,3] = %d\t", m(1,3));
printf("m[1,%d] = %d\n", P, m(1,P));

# checking second column (expected: 2, 2, 2, ..., 2)
printf("m[2,1] = %d\t", m(2,1));
printf("m[2,2] = %d\t", m(2,2));
printf("m[2,3] = %d\t", m(2,3));
printf("m[2,%d] = %d\n", P, m(2,P));

# checking Nth column (expected: 900, 900, 900, ..., 900)
printf("m[%d,1] = %d\t", N, m(N,1));
printf("m[%d,2] = %d\t", N, m(N,2));
printf("m[%d,3] = %d\t", N, m(N,3));
printf("m[%d,%d] = %d\n", N, P, m(N,P));
end

# each column of matrix is the sequence 1, 2, ..., N
# ------------------------------------------------------------[ case 1 ]--
printf("\ncase 1: without preallocated memory\n");
tic
for i = 1:N
m1(i, 1:P) = i * ones(1, P);
endfor
toc
display_result(m1, N, P);

# ------------------------------------------------------------[ case 2 ]--
printf("\ncase 2: preallocated memory with 'm = [];'\n");
tic
m2 = [];
for i = 1:N
m2(i, 1:P) = i * ones(1, P);
endfor
toc
display_result(m2, N, P);

# ------------------------------------------------------------[ case 3 ]--
printf("\ncase 3: preallocated memory with 'm(N, P) = 1;'\n");
tic
m3(N, P) = 1;
for i = 1:N
m3(i, 1:P) = i * ones(1, P);
endfor
toc
display_result(m3, N, P);

# ------------------------------------------------------------[ case 4 ]--
printf("\ncase 4: preallocated memory with 'm = ones(N, P);'\n");
tic
m4 = ones(N, P);
for i = 1:N
m4(i, 1:P) = i * ones (1, P);
endfor
toc
display_result(m4, N, P);

# ------------------------------------------------------------[ case 5 ]--
printf("\ncase 5: changing the looping sequence ");
printf("from 'for i = 1:N' to 'for i = N:-1:1'\n");
tic
m5 = [];
for i = N:-1:1
m5(i, 1:P) = i * ones (1, P);
endfor
toc
display_result(m5, N, P);

# ------------------------------------------------------------[ case 6 ]--
printf("\ncase 6: nested or more looping with preallocated memory\n");
tic
m6 = ones(N, P);
for i = N:-1:1
for j = P:-1:1
m6(i, j) = i;
endfor
endfor
toc
display_result(m6, N, P);

# ------------------------------------------------------------[ case 7 ]--
printf("\ncase 7: vectorizing using looping\n");
tic
m7 = ones(N, P);
for j = 1:P
m7(:,j) = 1:N;
endfor
toc
display_result(m7, N, P);

# ------------------------------------------------------------[ case 8 ]--
printf("\ncase 8: vectorizing using 'repmat' function\n");
tic
m8 = repmat((1:N)', 1, P);
toc
display_result(m8, N, P);
  • execution result
$ octave-3.0.1 -q trick_080426.octave
matrix size: m(900,1800)

case 1: without preallocated memory
Elapsed time is 39.3089 seconds.
result:
m[1,1] = 1 m[1,2] = 1 m[1,3] = 1 m[1,1800] = 1
m[2,1] = 2 m[2,2] = 2 m[2,3] = 2 m[2,1800] = 2
m[900,1] = 900 m[900,2] = 900 m[900,3] = 900 m[900,1800] = 900

case 2: preallocated memory with 'm = [];'
Elapsed time is 38.7618 seconds.
result:
m[1,1] = 1 m[1,2] = 1 m[1,3] = 1 m[1,1800] = 1
m[2,1] = 2 m[2,2] = 2 m[2,3] = 2 m[2,1800] = 2
m[900,1] = 900 m[900,2] = 900 m[900,3] = 900 m[900,1800] = 900

case 3: preallocated memory with 'm(N, P) = 1;'
Elapsed time is 0.126078 seconds.
result:
m[1,1] = 1 m[1,2] = 1 m[1,3] = 1 m[1,1800] = 1
m[2,1] = 2 m[2,2] = 2 m[2,3] = 2 m[2,1800] = 2
m[900,1] = 900 m[900,2] = 900 m[900,3] = 900 m[900,1800] = 900

case 4: preallocated memory with 'm = ones(N, P);'
Elapsed time is 0.082219 seconds.
result:
m[1,1] = 1 m[1,2] = 1 m[1,3] = 1 m[1,1800] = 1
m[2,1] = 2 m[2,2] = 2 m[2,3] = 2 m[2,1800] = 2
m[900,1] = 900 m[900,2] = 900 m[900,3] = 900 m[900,1800] = 900

case 5: changing the looping sequence from 'for i = 1:N' to 'for i = N:-1:1'
Elapsed time is 0.130076 seconds.
result:
m[1,1] = 1 m[1,2] = 1 m[1,3] = 1 m[1,1800] = 1
m[2,1] = 2 m[2,2] = 2 m[2,3] = 2 m[2,1800] = 2
m[900,1] = 900 m[900,2] = 900 m[900,3] = 900 m[900,1800] = 900

case 6: nested or more looping with preallocated memory
Elapsed time is 15.0053 seconds.
result:
m[1,1] = 1 m[1,2] = 1 m[1,3] = 1 m[1,1800] = 1
m[2,1] = 2 m[2,2] = 2 m[2,3] = 2 m[2,1800] = 2
m[900,1] = 900 m[900,2] = 900 m[900,3] = 900 m[900,1800] = 900

case 7: vectorizing using looping
Elapsed time is 0.053149 seconds.
result:
m[1,1] = 1 m[1,2] = 1 m[1,3] = 1 m[1,1800] = 1
m[2,1] = 2 m[2,2] = 2 m[2,3] = 2 m[2,1800] = 2
m[900,1] = 900 m[900,2] = 900 m[900,3] = 900 m[900,1800] = 900

case 8: vectorizing using 'repmat' function
Elapsed time is 0.045707 seconds.
result:
m[1,1] = 1 m[1,2] = 1 m[1,3] = 1 m[1,1800] = 1
m[2,1] = 2 m[2,2] = 2 m[2,3] = 2 m[2,1800] = 2
m[900,1] = 900 m[900,2] = 900 m[900,3] = 900 m[900,1800] = 900

Friday, April 18, 2008

openmp c/c++ on intel and linux using sun compiler

i had written another openmp topic titled openmp c/c++ on intel and linux using intel compiler. this blog is about how to do it again using sun compiler. following is the steps:
  • setting sunstudio
sunstudio is a development packet from sun microsystems. it contains netbeans ide and sun c compiler. we will need both of them to create the project and compile the code. my another blog titled sun studio on linux describes how to set up the sunstudio on debian gnu/linux. at the end of the blog, i give two links to sunstudio documentations on how to use it.
  • getting source code
i use a same source code as i use it on intel compiler. please find the source code from another blog titled openmp c/c++ on intel and linux using intel compiler on sample code bulleted list.
  • compiling source code
after setting up the project and loading the source code in, follow following steps to activate openmp feature during compilation process.
    • open project properties window: on project window, right click on project name, and select properties.
    • activate openmp feature: on categories tree box, select configuration properties - c/c++/fortran - c compiler - general. on properties table box, change multithreading level form none to openmp. click on ok button.
    • build the project: on menu bar, click on build - build main project.
activate openmp compilation feature
  • executing the binary file
you need to set maximum stack size with following command. i still don't know how to run the following command on the sunstudio ide before i execute the binary file. so i execute the binary file on console.
$ ulimit -s unlimited

you need also to set number of thread to use. i use intel core 2 duo processor. it's dual processor so i set it with 2.
$ export OMP_NUM_THREADS=2
$ export OMP_DYNAMIC=FALSE

i name the binary file with openmp feature, with sun_openmp. following is the execution result.
$ ./sun_openmp
Using time() for wall clock time
Problem size: c(600,2400) = a(600,1200) * b(1200,2400)
Calculating product 5 time(s)

We are using 2 thread(s)

Finished calculations.
Matmul kernel wall clock time = 6.00 sec
Wall clock time/thread = 3.00 sec
MFlops = 2880.000000

i name the binary file without openmp feature, with sun_not_openmp. following is the execution result.
$ ./sun_not_openmp
Using time() for wall clock time
Problem size: c(600,2400) = a(600,1200) * b(1200,2400)
Calculating product 5 time(s)

We are using 1 thread(s)

Finished calculations.
Matmul kernel wall clock time = 17.00 sec
Wall clock time/thread = 17.00 sec
MFlops = 1016.470588
  • system monitor
following captured kde system guard (performance monitor) shows cpu load of cpu0 and cpu1. both numbers on purple color describe:
(1) single processor working: only cpu0 is active 100%, the process is finished slower
(2) dual processors working: both cpu0 and cpu1 are active 100%, the process is finished faster

  • openmp api user's guide
for more deep understanding, please reefer to sun studio 12: openmp api user's guide.

Wednesday, April 2, 2008

openmp c/c++ on intel and linux using intel compiler

i'm using intel core 2 duo processor and debian gnu/linux 4.0. i'm now, in the step of learning openmp and i want to know how fast dual processors can run compared to single processor.
  • hardware and software specification
following processor information is taken from /proc/cpuinfo file.
vendor_id       : GenuineIntel
cpu family : 6
model : 15
model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
stepping : 6
cpu MHz : 2400.778
cache size : 4096 KB
physical id : 0
siblings : 2
cpu cores : 2

i'm using debian gnu/linux 4.0.
$ uname -a
Linux l411v 2.6.18-6-686 #1 SMP Sun Feb 10 22:11:31 UTC 2008 i686 GNU/Linux

i get and install intel c++ compiler professional edition for linux for non-commercial use.
$ icc --version
icc (ICC) 10.1 20080112
Copyright (C) 1985-2007 Intel Corporation. All rights reserved.
  • setting environment
set executable path to intel c++ compiler binary directory.
$ export PATH=$PATH:/usr/share/intel/cc/10.1.012/bin/

set library path to intel c++ dynamic library directory.
$ export LD_LIBRARY_PATH=/usr/share/intel/cc/10.1.012/lib
$ echo $LD_LIBRARY_PATH
/usr/share/intel/cc/10.1.012/lib
  • fixing bug
i get openmp sample code (openmp_sample.c) after installing intel c++ compiler from sample directory. please find the sample code below. to activate openmp feature, add additional -openmp compiling parameter. i found some errors when compiling the sample code without -openmp compiling parameter. the errors as follow:
$ icc -std=c99 openmp_sample.c
openmp_sample.c(106): warning #161: unrecognized #pragma
#pragma omp parallel private(i,j,k)
^

openmp_sample.c(109): warning #161: unrecognized #pragma
#pragma omp single nowait
^

openmp_sample.c(119): warning #161: unrecognized #pragma
#pragma omp for nowait
^

openmp_sample.c(126): warning #161: unrecognized #pragma
#pragma omp for nowait
^

what i need to do just adding preprocessor to block all openmp features on the sample code. so change from:
  #pragma omp parallel private(i,j,k)
to:
#ifdef _OPENMP
#pragma omp parallel private(i,j,k)
#endif
  • compile and run
ulimit command controls the user resources available to a process started by the shell. you need set the stack size to an appropriate size; otherwise, the application will generate a segmentation fault. following command sets the maximum stack size to unlimited.
$ ulimit -s unlimited

compile with openmp feature and static linking mode:
$ icc -std=c99 -openmp -static openmp_sample.c
openmp_sample.c(119): (col. 5) remark: OpenMP DEFINED LOOP WAS PARALLELIZED.
openmp_sample.c(126): (col. 5) remark: OpenMP DEFINED LOOP WAS PARALLELIZED.
openmp_sample.c(106): (col. 3) remark: OpenMP DEFINED REGION WAS PARALLELIZED.

$ ls -l
total 1092
-rwxr-xr-x 1 lain lain 1105135 2008-04-02 11:48 a.out
-rw-r--r-- 1 lain lain 4702 2008-04-02 11:06 openmp_sample.c

$ ./a.out

Using time() for wall clock time
Problem size: c(600,2400) = a(600,1200) * b(1200,2400)
Calculating product 5 time(s)

We are using 2 thread(s)

Finished calculations.
Matmul kernel wall clock time = 6.00 sec
Wall clock time/thread = 3.00 sec
MFlops = 2880.000000

compile with openmp feature and dynamic linking mode:
$ icc -std=c99 -openmp openmp_sample.c
openmp_sample.c(124): (col. 5) remark: OpenMP DEFINED LOOP WAS PARALLELIZED.
openmp_sample.c(133): (col. 5) remark: OpenMP DEFINED LOOP WAS PARALLELIZED.
openmp_sample.c(107): (col. 3) remark: OpenMP DEFINED REGION WAS PARALLELIZED.

$ ls -l
total 44
-rwxr-xr-x 1 lain lain 34087 2008-04-02 12:01 a.out
-rw-r--r-- 1 lain lain 4702 2008-04-02 11:06 openmp_sample.c

$ ./a.out
Using time() for wall clock time
Problem size: c(600,2400) = a(600,1200) * b(1200,2400)
Calculating product 5 time(s)

We are using 2 thread(s)

Finished calculations.
Matmul kernel wall clock time = 6.00 sec
Wall clock time/thread = 3.00 sec
MFlops = 2880.000000

compile without openmp feature and static linking mode:
$ icc -std=c99 -static openmp_sample.c

$ ls -l
total 512
-rwxr-xr-x 1 lain lain 509458 2008-04-02 11:49 a.out
-rw-r--r-- 1 lain lain 4702 2008-04-02 11:06 openmp_sample.c

$ ./a.out
Using time() for wall clock time
Problem size: c(600,2400) = a(600,1200) * b(1200,2400)
Calculating product 5 time(s)

We are using 1 thread(s)

Finished calculations.
Matmul kernel wall clock time = 17.00 sec
Wall clock time/thread = 17.00 sec
MFlops = 1016.470588

compile without openmp feature and dynamic linking mode:
$ icc -std=c99 openmp_sample.c

$ ls -l
total 32
-rwxr-xr-x 1 lain lain 23570 2008-04-02 12:03 a.out
-rw-r--r-- 1 lain lain 4702 2008-04-02 11:06 openmp_sample.c

$ ./a.out
Using time() for wall clock time
Problem size: c(600,2400) = a(600,1200) * b(1200,2400)
Calculating product 5 time(s)


We are using 1 thread(s)

Finished calculations.
Matmul kernel wall clock time = 17.00 sec
Wall clock time/thread = 17.00 sec
MFlops = 1016.470588
  • conclusion
| openmp  | number of | linking | file size |  time to  |    mega      |
| feature | thread | mode | (byte) | finish | flops |
| | | | | (seconds) | |
+---------+-----------+---------+-----------+-----------+--------------+
| yes | 2 | static | 1,105,135 | 6 | 2,880.000000 |
| yes | 2 | dynamic | 34,087 | 6 | 2,880.000000 |
| no | 1 | static | 509,458 | 17 | 1,016.470588 |
| no | 1 | dynamic | 23,570 | 17 | 1,016.470588 |
  • system monitor
following captured kde system guard (performance monitor) shows cpu load of cpu0 and cpu1. both numbers on purple color describe:
(1) single processor working: only cpu0 is active 100%, the process is finished slower
(2) dual processors working: both cpu0 and cpu1 are active 100%, the process is finished faster

  • sample code
openmp_sample.c file:
/*
* Copyright (C) 2006-2007 Intel Corporation. All Rights Reserved.
*
* The source code contained or described herein and all
* documents related to the source code ("Material") are owned by
* Intel Corporation or its suppliers or licensors. Title to the
* Material remains with Intel Corporation or its suppliers and
* licensors. The Material is protected by worldwide copyright
* laws and treaty provisions. No part of the Material may be
* used, copied, reproduced, modified, published, uploaded,
* posted, transmitted, distributed, or disclosed in any way
* except as expressly provided in the license provided with the
* Materials. No license under any patent, copyright, trade
* secret or other intellectual property right is granted to or
* conferred upon you by disclosure or delivery of the Materials,
* either expressly, by implication, inducement, estoppel or
* otherwise, except as expressly provided in the license
* provided with the Materials.
*
* [DESCRIPTION]
* Each element of the product matrix c[i][j] is
* computed from a unique row and
* column of the factor matrices, a[i][k] and b[k][j].
*
* In the multithreaded implementation, each thread can
* concurrently compute some submatrix of the product without
* needing OpenMP data or control synchronization.
*
* The algorithm uses OpenMP* to parallelize the outer-most loop,
* using the "i" row index.
*
* Both the outer-most "i" loop and middle "k" loop are manually
* unrolled by 4. The inner-most "j" loop iterates one-by-one
* over the columns of the product and factor matrices.
*
* [COMPILE]
* Use the following compiler options to compile both multi- and
* single-threaded versions.
*
* Parallel compilation:
* You must set the stacksize to an appropriate size; otherwise,
* the application will generate a segmentation fault.
* Linux* and Mac OS* X: appropriate ulimit commands are shown for
* bash shell.
*
* Windows*: /Qstd=c99 /Qopenmp /F256000000
*
* Linux*: ulimit -s unlimited
* -std=c99 -openmp
*
* Mac OS* X: ulimit -s 64000
* -std=c99 -openmp
*
* Serial compilation:
*
* Use the same command, but omit the -openmp (Linux and Mac OS X)
* or /Qopenmp (Windows) option.
*
*/

#include <stdio.h>
#include <time.h>
#include <float.h>
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#define bool _Bool
#define true 1
#define false 0

// Matrix size constants
// Be careful to set your shell's stacksize limit to a high value if you
// wish to increase the SIZE.
#define SIZE 4800 // Must be a multiple of 8.
#define M SIZE/8
#define N SIZE/4
#define P SIZE/2
#define NTIMES 5 // product matrix calculations

int main(void)
{
double a[M][N], b[N][P], c[M][P], walltime;
bool nthr_checked=false;
time_t start;

int i, j, k, l, i1, i2, i3, k1, k2, k3, nthr=1;

printf("Using time() for wall clock time\n");
printf("Problem size: c(%d,%d) = a(%d,%d) * b(%d,%d)\n",
M, P, M, N, N, P);
printf("Calculating product %d time(s)\n", NTIMES);

// a is identity matrix
for (i=0; i<M; i++)
for (j=0; j<N; j++)
a[i][j] = 1.0;

// each column of b is the sequence 1,2,...,N
for (i=0; i<N; i++)
for (j=0; j<P; j++)
b[i][j] = i+1.;

start = time(NULL);

#ifdef _OPENMP
#pragma omp parallel private(i,j,k)
#endif
{
for (l=0; l<NTIMES; l++) {
#ifdef _OPENMP
#pragma omp single nowait
#endif
if (!nthr_checked) {
#ifdef _OPENMP
nthr = omp_get_num_threads();
#endif
printf( "\nWe are using %d thread(s)\n", nthr);
nthr_checked = true;
}

// Initialize product matrix
#ifdef _OPENMP
#pragma omp for nowait
#endif
for (i=0; i<M; i++)
for (j=0; j<P; j++)
c[i][j] = 0.0;

// Parallelize by row. The threads don't need to synchronize at
// loop end, so "nowait" can be used.
#ifdef _OPENMP
#pragma omp for nowait
#endif
for (i=0; i<M; i++) {
for (k=0; k<N; k++) {
// Each element of the product is just the sum 1+2+...+n
for (j=0; j<P; j++) {
c[i][j] += a[i][k] * b[k][j];
}
}
}
} // #pragma omp parallel private(i,j,k)
} // l=0,...NTIMES-1

walltime = time(NULL) - start;
printf("\nFinished calculations.\n");
printf("Matmul kernel wall clock time = %.2f sec\n", walltime);
printf("Wall clock time/thread = %.2f sec\n", walltime/nthr);
printf("MFlops = %f\n",
(double)(NTIMES)*(double)(N*M*2)*(double)(P)/walltime/1.0e6);

return 0;
}