Hello,<br><br>I&#39;m new to parallel programming and MPI.&nbsp; I&#39;ve developed a simulator in C++ for which I would<br>like to decrease the running time by using a Beowulf cluster.&nbsp; I&#39;m not interested in squeezing out maximum <br>

performance; I&#39;m just looking for a quick and easy way to significantly improve the speed over running<br>the program on a single machine.<br>Basically, I need to learn how to parallelize a C++ function.&nbsp; The following functions in particular

<br>take the longest to run in my simulator.&nbsp; The first implements LU decomposition on a large matrix and <br>the second implements back-substitution to solve the resulting linear system (the matrix division).<br><br>void NR::ludcmp(Mat_IO_DP &amp;a, Vec_O_INT &amp;indx, DP &amp;d)

<br>{<br>&nbsp;&nbsp; &nbsp;const DP TINY=1.0e-20;<br>&nbsp;&nbsp; &nbsp;int i,imax,j,k;<br>&nbsp;&nbsp; &nbsp;DP big,dum,sum,temp;<br><br>&nbsp;&nbsp; &nbsp;int n=a.nrows();<br>&nbsp;&nbsp; &nbsp;Vec_DP vv(n);<br>&nbsp;&nbsp; &nbsp;d=1.0;<br>&nbsp;&nbsp; &nbsp;for (i=0;i&lt;n;i++) {<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;big=0.0;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;for (j=0;j&lt;n;j++)

<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;if ((temp=fabs(a[i][j])) &gt; big) big=temp;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;if (big == 0.0) nrerror(&quot;Singular matrix in routine ludcmp&quot;);<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;vv[i]=1.0/big;<br>&nbsp;&nbsp; &nbsp;}<br>&nbsp;&nbsp; &nbsp;for (j=0;j&lt;n;j++) {<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;for (i=0;i&lt;j;i++) {

<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;sum=a[i][j];<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;for (k=0;k&lt;i;k++) sum -= a[i][k]*a[k][j];<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;a[i][j]=sum;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;}<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;big=0.0;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;for (i=j;i&lt;n;i++) {<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;sum=a[i][j];<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;for (k=0;k&lt;j;k++) sum -= a[i][k]*a[k][j];

<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;a[i][j]=sum;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;if ((dum=vv[i]*fabs(sum)) &gt;= big) {<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;big=dum;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;imax=i;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;}<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;}<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;if (j != imax) {<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;for (k=0;k&lt;n;k++) {

<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;dum=a[imax][k];<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;a[imax][k]=a[j][k];<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;a[j][k]=dum;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;}<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;d = -d;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;vv[imax]=vv[j];<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;}<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;indx[j]=imax;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;if (a[j][j] == 

0.0) a[j][j]=TINY;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;if (j != n-1) {<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;dum=1.0/(a[j][j]);<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;for (i=j+1;i&lt;n;i++) a[i][j] *= dum;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;}<br>&nbsp;&nbsp; &nbsp;}<br>}<br><br><br><br>and...<br><br><br><br>void NR::lubksb(Mat_I_DP &amp;a, Vec_I_INT &amp;indx, Vec_IO_DP &amp;b)

<br>{<br>&nbsp;&nbsp; &nbsp;int i,ii=0,ip,j;<br>&nbsp;&nbsp; &nbsp;DP sum;<br><br>&nbsp;&nbsp; &nbsp;int n=a.nrows();<br>&nbsp;&nbsp; &nbsp;for (i=0;i&lt;n;i++) {<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;ip=indx[i];<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;sum=b[ip];<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;b[ip]=b[i];<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;if (ii != 0)<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;for (j=ii-1;j&lt;i;j++) sum -= a[i][j]*b[j];

<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;else if (sum != 0.0)<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;ii=i+1;<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;b[i]=sum;<br>&nbsp;&nbsp; &nbsp;}<br>&nbsp;&nbsp; &nbsp;for (i=n-1;i&gt;=0;i--) {<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;sum=b[i];<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;for (j=i+1;j&lt;n;j++) sum -= a[i][j]*b[j];<br>&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;b[i]=sum/a[i][i];

<br>&nbsp;&nbsp; &nbsp;}<br>}<br><br>(The functions are borrowed from the library provided by &quot;Numerical Recipes in C++&quot;)<br>I&#39;m currently calling these functions from the main loop with the lines:<br><br>NR::ludcmp(c,indx,d);

<br><br>and<br><br>NR::lubksb(c,indx,xv);<br><br>where the variable &#39;c&#39; is a large matrix (holding image pixel values) and &#39;xv&#39; is a vector <br>used in backsubstitution.&nbsp; <br>All of the variables passed into these functions are of types defined in &quot;

nr.h&quot;<br>&#39;c&#39; is a Mat_DP&nbsp; (double-precision matrix)<br>&#39;indx&#39; is a Vec_INT&nbsp; (integer vector)<br>&#39;d&#39; is a DP&nbsp;&nbsp;&nbsp; (double precision)<br>&#39;xv&#39; is a Vec_DP&nbsp; (double precision vector)<br><br>

Is there a simple way to call these functions that will distribute the <br>computational load across the cluster?&nbsp; Currently, when I run the program with a 50x50 array on a single machine, <br>it takes about 5 minutes to process a single iteration through the matrix division.

<br><br>Any help would be greatly appreciated. <br>

!DSPAM:469ca2c1224468298414181!