Recursion Unrolling

Document Sample

```					        Recursion Unrolling
for Divide and Conquer Programs

Laboratory for Computer Science
Massachusetts Institute of Technology

• Automatic generation of efficient
large base cases for divide and
conquer programs
Outline
1. Motivating Example
2. Computation Structure
3. Transformations
4. Related Work
5. Conclusion
1. Motivating Example
Divide and Conquer Matrix Multiply

A                B        =             R
A0       A1       B0       B1       A0B0+A1B2 A0B1+A1B3
                 =
A2       A3       B2       B3       A2B0+A3B2 A2B1+A3B3

• Divide matrices into sub-matrices: A0, A1, A2, etc.
• Use blocked matrix multiply equations
Divide and Conquer Matrix Multiply

A                B        =             R
A0       A1       B0       B1       A0B0+A1B2 A0B1+A1B3
                 =
A2       A3       B2       B3       A2B0+A3B2 A2B1+A3B3

• Recursively multiply sub-matrices
Divide and Conquer Matrix Multiply

A           B     =             R

a0          b0    =           a0  b0

• Terminate recursion with a simple base case
Divide and Conquer Matrix Multiply
void matmul(int *A, int *B, int *R, int n) {  Implements   R += A × B
if (n == 1) {
(*R) += (*A) * (*B);
} else {
matmul(A, B, R, n/4);
matmul(A, B+(n/4), R+(n/4), n/4);
matmul(A+2*(n/4), B, R+2*(n/4), n/4);
matmul(A+2*(n/4), B+(n/4), R+3*(n/4), n/4);
matmul(A+(n/4), B+2*(n/4), R, n/4);
matmul(A+(n/4), B+3*(n/4), R+(n/4), n/4);
matmul(A+3*(n/4), B+2*(n/4), R+2*(n/4), n/4);
matmul(A+3*(n/4), B+3*(n/4), R+3*(n/4), n/4);
}
Divide and Conquer Matrix Multiply
void matmul(int *A, int *B, int *R, int n) {        Divide matrices into
if (n == 1) {                                      sub-matrices and
(*R) += (*A) * (*B);                            recursively multiply
} else {
sub-matrices
matmul(A, B, R, n/4);
matmul(A, B+(n/4), R+(n/4), n/4);
matmul(A+2*(n/4), B, R+2*(n/4), n/4);
matmul(A+2*(n/4), B+(n/4), R+3*(n/4), n/4);
matmul(A+(n/4), B+2*(n/4), R, n/4);
matmul(A+(n/4), B+3*(n/4), R+(n/4), n/4);
matmul(A+3*(n/4), B+2*(n/4), R+2*(n/4), n/4);
matmul(A+3*(n/4), B+3*(n/4), R+3*(n/4), n/4);
}
Divide and Conquer Matrix Multiply
void matmul(int *A, int *B, int *R, int n) {      Identify sub-matrices
if (n == 1) {                                    with pointers
(*R) += (*A) * (*B);
} else {
matmul(A, B, R, n/4);
matmul(A, B+(n/4), R+(n/4), n/4);
matmul(A+2*(n/4), B, R+2*(n/4), n/4);
matmul(A+2*(n/4), B+(n/4), R+3*(n/4), n/4);
matmul(A+(n/4), B+2*(n/4), R, n/4);
matmul(A+(n/4), B+3*(n/4), R+(n/4), n/4);
matmul(A+3*(n/4), B+2*(n/4), R+2*(n/4), n/4);
matmul(A+3*(n/4), B+3*(n/4), R+3*(n/4), n/4);
}
Divide and Conquer Matrix Multiply
void matmul(int *A, int *B, int *R, int n) {        Use a simple
if (n == 1) {                                      algorithm for the
(*R) += (*A) * (*B);                            base case
} else {
matmul(A, B, R, n/4);
matmul(A, B+(n/4), R+(n/4), n/4);
matmul(A+2*(n/4), B, R+2*(n/4), n/4);
matmul(A+2*(n/4), B+(n/4), R+3*(n/4), n/4);
matmul(A+(n/4), B+2*(n/4), R, n/4);
matmul(A+(n/4), B+3*(n/4), R+(n/4), n/4);
matmul(A+3*(n/4), B+2*(n/4), R+2*(n/4), n/4);
matmul(A+3*(n/4), B+3*(n/4), R+3*(n/4), n/4);
}
Divide and Conquer Matrix Multiply
void matmul(int *A, int *B, int *R, int n) {        • Advantage of
if (n == 1) {                                        small base case:
(*R) += (*A) * (*B);
simplicity
} else {
matmul(A, B, R, n/4);
matmul(A, B+(n/4), R+(n/4), n/4);               • Code is easy to:
matmul(A+2*(n/4), B, R+2*(n/4), n/4);              •   Write
matmul(A+2*(n/4), B+(n/4), R+3*(n/4), n/4);
•   Maintain
matmul(A+(n/4), B+2*(n/4), R, n/4);
matmul(A+(n/4), B+3*(n/4), R+(n/4), n/4);
•   Debug
matmul(A+3*(n/4), B+2*(n/4), R+2*(n/4), n/4);      •   Understand
matmul(A+3*(n/4), B+3*(n/4), R+3*(n/4), n/4);
}
Divide and Conquer Matrix Multiply
void matmul(int *A, int *B, int *R, int n) {      • Disadvantage:
if (n == 1) {                                      inefficiency
(*R) += (*A) * (*B);
} else {
matmul(A, B, R, n/4);                         • Large control flow
matmul(A, B+(n/4), R+(n/4), n/4);               overhead
matmul(A+2*(n/4), B, R+2*(n/4), n/4);         • Most of the time is
matmul(A+2*(n/4), B+(n/4), R+3*(n/4), n/4);     spent in dividing the
matmul(A+(n/4), B+2*(n/4), R, n/4);             matrix into sub-matrices
matmul(A+(n/4), B+3*(n/4), R+(n/4), n/4);
matmul(A+3*(n/4), B+2*(n/4), R+2*(n/4), n/4);
matmul(A+3*(n/4), B+3*(n/4), R+3*(n/4), n/4);
}
Hand Coded Implementation
void serialmul(block *As, block *Bs, block *Rs)            s0_0   +=   ap[5] * bp[80];                 s1_1 += ap[29] *   bp[209];
{                                                          s0_1   +=   ap[5] * bp[81];                 s0_0 += ap[14] *   bp[224];
int i, j;                                              s1_0   +=   ap[21] * bp[80];                s0_1 += ap[14] *   bp[225];
s1_1   +=   ap[21] * bp[81];                s1_0 += ap[30] *   bp[224];
DOUBLE *A = (DOUBLE *) As;                              s0_0   +=   ap[6] * bp[96];                 s1_1 += ap[30] *   bp[225];
DOUBLE *B = (DOUBLE *) Bs;                              s0_1   +=   ap[6] * bp[97];                 s0_0 += ap[15] *   bp[240];
DOUBLE *R = (DOUBLE *) Rs;                              s1_0   +=   ap[22] * bp[96];                s0_1 += ap[15] *   bp[241];
s1_1   +=   ap[22] * bp[97];                s1_0 += ap[31] *   bp[240];
for (j = 0; j < 16; j += 2) {                           s0_0   +=   ap[7] * bp[112];                s1_1 += ap[31] *   bp[241];
DOUBLE *bp = &B[j];                                 s0_1   +=   ap[7] * bp[113];                rp[0] = s0_0;
for (i = 0; i < 16; i += 2) {                       s1_0   +=   ap[23] * bp[112];               rp[1] = s0_1;
DOUBLE *ap = &A[i * 16];                        s1_1   +=   ap[23] * bp[113];               rp[16] = s1_0;
DOUBLE *rp = &R[j + i * 16];                    s0_0   +=   ap[8] * bp[128];                rp[17] = s1_1;
register DOUBLE s0_0 = rp[0], s0_1 = rp[1];     s0_1   +=   ap[8] * bp[129];            }
register DOUBLE s1_0 = rp[16], s1_1 = rp[17];   s1_0   +=   ap[24] * bp[128];       }
s0_0 += ap[0] * bp[0];                          s1_1   +=   ap[24] * bp[129];   }
s0_1 += ap[0] * bp[1];                          s0_0   +=   ap[9] * bp[144];
s1_0 += ap[16] * bp[0];                         s0_1   +=   ap[9] * bp[145];    cilk void matrixmul(long nb, block *A, block *B, block *R)
s1_1 += ap[16] * bp[1];                         s1_0   +=   ap[25] * bp[144];   {
s0_0 += ap[1] * bp[16];                         s1_1   +=   ap[25] * bp[145];       if (nb == 1) {
s0_1 += ap[1] * bp[17];                         s0_0   +=   ap[10] * bp[160];            flops = serialmul(A, B, R);
s1_0 += ap[17] * bp[16];                        s0_1   +=   ap[10] * bp[161];       } else if (nb >= 4) {
s1_1 += ap[17] * bp[17];                        s1_0   +=   ap[26] * bp[160];              spawn matrixmul(nb/4, A, B, R);
s0_0 += ap[2] * bp[32];                         s1_1   +=   ap[26] * bp[161];              spawn matrixmul(nb/4, A, B+(nb/4), R+(nb/4));
s0_1 += ap[2] * bp[33];                         s0_0   +=   ap[11] * bp[176];              spawn matrixmul(nb/4, A+2*(nb/4), B, R+2*(nb/4));
s1_0 += ap[18] * bp[32];                        s0_1   +=   ap[11] * bp[177];              spawn matrixmul(nb/4, A+2*(nb/4), B+(nb/4), R+3*(nb/4));
s1_1 += ap[18] * bp[33];                        s1_0   +=   ap[27] * bp[176];              sync;
s0_0 += ap[3] * bp[48];                         s1_1   +=   ap[27] * bp[177];              spawn matrixmul(nb/4, A+(nb/4), B+2*(nb/4), R);
s0_1 += ap[3] * bp[49];                         s0_0   +=   ap[12] * bp[192];              spawn matrixmul(nb/4, A+(nb/4), B+3*(nb/4), R+(nb/4));
s1_0 += ap[19] * bp[48];                        s0_1   +=   ap[12] * bp[193];              spawn matrixmul(nb/4, A+3*(nb/4), B+2*(nb/4), R+2*(nb/4));
s1_1 += ap[19] * bp[49];                        s1_0   +=   ap[28] * bp[192];              spawn matrixmul(nb/4, A+3*(nb/4), B+3*(nb/4), R+3*(nb/4));
s0_0 += ap[4] * bp[64];                         s1_1   +=   ap[28] * bp[193];              sync;
s0_1 += ap[4] * bp[65];                         s0_0   +=   ap[13] * bp[208];       }
s1_0 += ap[20] * bp[64];                        s0_1   +=   ap[13] * bp[209];   }
s1_1 += ap[20] * bp[65];                        s1_0   +=   ap[29] * bp[208];
Goal

• The programmer writes simple code
with small base cases

• The compiler automatically generates
efficient code with large base cases
2. Computation Structure
Running Example – Array Increment

void f(char *p, int n) {
if (n == 1) {
/* base case: increment one element */
(*p) += 1;
} else {
f(p, n/2);        /* increment first half */
f(p+n/2, n/2);    /* increment second half */
}
}
Dynamic Call Tree for n=4
Execution of f(p,4)
Dynamic Call Tree for n=4
Execution of f(p,4)
Test n=1
Call f Call f
Dynamic Call Tree for n=4
Execution of f(p,4)
Activation Frame        Test n=1
on the Stack        Call f Call f
Dynamic Call Tree for n=4
Execution of f(p,4)
Test n=1          Executed
Call f Call f      Instructions
Dynamic Call Tree for n=4
Execution of f(p,4)
Test n=1
Call f Call f
Dynamic Call Tree for n=4
Execution of f(p,4)
Test n=1
n=4
Call f Call f

Test n=1                        Test n=1
n=2
Call f Call f                   Call f Call f
Dynamic Call Tree for n=4
Execution of f(p,4)
Test n=1
n=4
Call f Call f

Test n=1                        Test n=1
n=2
Call f Call f                   Call f Call f

Test n=1    Test n=1        Test n=1        Test n=1
n=1
Inc *p      Inc *p          Inc *p          Inc *p
Execution of f(p,4)

n=4
Call f Call f

Test n=1                        Test n=1
n=2
Call f Call f                   Call f Call f

Test n=1    Test n=1        Test n=1        Test n=1
n=1
Inc *p      Inc *p          Inc *p          Inc *p
Execution of f(p,4)

n=4
Test n=1                Call overhead +
Call f Call f              Test overhead

Test n=1                        Test n=1
n=2
Call f Call f                   Call f Call f

Test n=1    Test n=1        Test n=1        Test n=1
n=1
Inc *p      Inc *p          Inc *p          Inc *p
Computation
Execution of f(p,4)

n=4
Test n=1                Call overhead +
Call f Call f              Test overhead
   Computation

Test n=1                        Test n=1
n=2
Call f Call f                   Call f Call f

Test n=1    Test n=1        Test n=1        Test n=1
n=1
Inc *p      Inc *p          Inc *p          Inc *p
Large Base Cases = Reduced Overhead
Execution of f(p,4)
Test n=2
n=4
Call f Call f

Test n=2                     Test n=2
n=2
Inc *p                       Inc *p
Inc *(p+1)                   Inc *(p+1)
3. Transformations
Transformation 1: Recursion Inlining

void f (char *p, int n)
if (n == 1) {
(*p) += 1;
} else {
f(p, n/2);
f(p+n/2, n/2);
}
Transformation 1: Recursion Inlining
Make two copies of the original procedure

void f1(char *p, int n)     void f2(char *p, int n)
if (n == 1) {               if (n == 1) {
(*p) += 1;                  (*p) += 1;
} else {                    } else {
f1(p, n/2);                 f2(p, n/2);
f1(p+n/2, n/2);             f2(p+n/2, n/2);
}                           }
Transformation 1: Recursion Inlining
Transform direct recursion to mutual recursion

void f1(char *p, int n)     void f2(char *p, int n)
if (n == 1) {               if (n == 1) {
(*p) += 1;                  (*p) += 1;
} else {                    } else {
f2(p, n/2);                 f1(p, n/2);
f2(p+n/2, n/2);             f1(p+n/2, n/2);
}                           }
Transformation 1: Recursion Inlining
Inline procedure f2 at call sites in f1

void f1(char *p, int n)       void f2(char *p, int n)
if (n == 1) {                 if (n == 1) {
(*p) += 1;                    (*p) += 1;
} else {                      } else {
f2(p, n/2);                   f1(p, n/2);
f2(p+n/2, n/2);               f1(p+n/2, n/2);
}                             }
Transformation 1: Recursion Inlining
void f1(char *p, int n)
if (n == 1) {
(*p) += 1;
} else {
if (n/2 == 1) {
*p += 1;
} else {
f1(p, n/2/2);
f1(p+n/2/2, n/2/2);
}
if (n/2 == 1) {
*(p+n/2) += 1;
} else {
f1(p+n/2, n/2/2);
f1(p+n/2+n/4, n/2/2);
}
}
Transformation 1: Recursion Inlining
void f1(char *p, int n)
if (n == 1) {
(*p) += 1;                   • Reduced procedure call overhead
} else {
if (n/2 == 1) {
*p += 1;
• More code exposed
} else {                     at the intra-procedural
f1(p, n/2/2);             level
f1(p+n/2/2, n/2/2);
}
if (n/2 == 1) {            • Opportunities to
*(p+n/2) += 1;            simplify control flow
} else {                     in the inlined code
f1(p+n/2, n/2/2);
f1(p+n/2+n/4, n/2/2);
}
}
Transformation 1: Recursion Inlining
void f1(char *p, int n)
if (n == 1) {
(*p) += 1;                   • Reduced procedure call overhead
} else {
if (n/2 == 1) {
*p += 1;
• More code exposed
} else {                     at the intra-procedural
f1(p, n/2/2);             level
f1(p+n/2/2, n/2/2);
}
if (n/2 == 1) {            • Opportunities to
*(p+n/2) += 1;            simplify control flow
} else {                     in the inlined code:
f1(p+n/2, n/2/2);
• identical condition
f1(p+n/2+n/4, n/2/2);
}
expressions
}
Transformation 2: Conditional Fusion
Merge if statements with identical conditions

void f1(char *p, int n)
if (n == 1) {
*p += 1;
} else if (n/2 == 1) {
*p += 1;
*(p+n/2) += 1;
} else {
f1(p, n/2/2);
f1(p+n/2/2, n/2/2);
f1(p+n/2, n/2/2);
f1(p+n/2+n/4, n/2/2);
}
Transformation 2: Conditional Fusion
Merge if statements with identical conditions

void f1(char *p, int n)      • Reduced branching
if (n == 1) {                overhead and bigger
*p += 1;                  basic blocks
} else if (n/2 == 1) {
*p += 1;
• Larger base case for
*(p+n/2) += 1;
n/2 = 1
} else {
f1(p, n/2/2);
f1(p+n/2/2, n/2/2);
f1(p+n/2, n/2/2);
f1(p+n/2+n/4, n/2/2);
}
Unrolling Iterations
Repeatedly apply inlining and conditional fusion
void f1(char *p, int n)
if (n == 1) {
*p += 1;
} else if (n/2 == 1) {
*p += 1;
*(p+n/2) += 1;
} else {
f1(p, n/2/2);
f1(p+n/2/2, n/2/2);
f1(p+n/2, n/2/2);
f1(p+n/2+n/4, n/2/2);
}
Second Unrolling Iteration

void f1(char *p, int n)      void f2(char *p, int n)
if (n == 1) {                if (n == 1) {
*p += 1;                     *p += 1;
} else if (n/2 == 1) {       } else {
*p += 1;                     f2(p, n/2);
*(p+n/2) += 1;               f2(p+n/2, n/2);
} else {                     }
f1(p, n/2/2);
f1(p+n/2/2, n/2/2);
f1(p+n/2, n/2/2);
f1(p+n/2+n/4, n/2/2);
}
Second Unrolling Iteration

void f1(char *p, int n)      void f2(char *p, int n)
if (n == 1) {                if (n == 1) {
*p += 1;                     *p += 1;
} else if (n/2 == 1) {       } else {
*p += 1;                     f1(p, n/2);
*(p+n/2) += 1;               f1(p+n/2, n/2);
} else {                     }
f2(p, n/2/2);
f2(p+n/2/2, n/2/2);
f2(p+n/2, n/2/2);
f2(p+n/2+n/4, n/2/2);
}
Result of Second Unrolling Iteration

void f1(char *p, int n)
if (n == 1) {
*p += 1;                else {
} else if (n/2 == 1) {       f1(p, n/2/2/2);
*p += 1;                  f1(p+n/2/2/2, n/2/2/2);
*(p+n/2) += 1;            f1(p+n/2/2, n/2/2/2);
} else if (n/2/2 == 1) {     f1(p+n/2/2+n/2/2/2, n/2/2/2);
*p += 1;                  f1(p+n/2, n/2/2/2);
*(p+n/2/2) += 1;          f1(p+n/2+n/2/2/2, n/2/2/2);
*(p+n/2) += 1;            f1(p+n/2+n/2/2, n/2/2/2);
*(p+n/2+n/2/2) += 1;      f1(p+n/2+n/2/2+n/2/2/2, n/2/2/2);
}                          }
Unrolling Iterations

• The unrolling process stops when the number of
iterations reaches the desired unrolling factor

• The unrolled recursive procedure:
• Has base cases for larger problem sizes
• Divides the given problem into more sub-problems
of smaller sizes

• In our example:
• Base cases for n=1, n=2, and n=4
• Problems are divided into 8 problems of 1/8 size
Speedup for Matrix Multiply
Matrix of 512 x 512 elements

inline               inline+fusion
10

8
speedup

6

4

2

0
1                        2
unrolling factor
Speedup for Matrix Multiply
Matrix of 512 x 512 elements

inline               inline+fusion
10

8
speedup

6

4

2

0
1                        2
unrolling factor
Speedup for Matrix Multiply
Matrix of 1024 x 1024 elements

inline               inline+fusion
10

8
speedup

6

4

2

0
1                        2
unrolling factor
Efficiency of Unrolled Recursive Part

• Because the recursive part is also unrolled,
recursion may not exercise the large base cases

• Which base case is executed depends on the size of
the input problem

• In our example:
• For a problem of size n=8, the base case for n=1 is executed
• For a problem of size n=16, the base case for n=2 is executed
• The efficient base case for n=4 is not executed in these cases
Solution: Recursion Re-Rolling

• Roll back the recursive part of the unrolled procedure
after the large base cases are generated

• Re-Rolling ensures that larger base cases are always
executed, independent of the input problem size

• The compiler unrolls the recursive part only
temporarily, to generate the base cases
Transformation 3: Recursion Re-Rolling

void f1(char *p, int n)
if (n == 1) {
*p += 1;                else {
} else if (n/2 == 1) {       f1(p, n/2/2/2);
*p += 1;                  f1(p+n/2/2/2, n/2/2/2);
*(p+n/2) += 1;            f1(p+n/2/2, n/2/2/2);
} else if (n/2/2 == 1) {     f1(p+n/2/2+n/2/2/2, n/2/2/2);
*p += 1;                  f1(p+n/2, n/2/2/2);
*(p+n/2/2) += 1;          f1(p+n/2+n/2/2/2, n/2/2/2);
*(p+n/2) += 1;            f1(p+n/2+n/2/2, n/2/2/2);
*(p+n/2+n/2/2) += 1;      f1(p+n/2+n/2/2+n/2/2/2, n/2/2/2);
}                          }
Transformation 3: Recursion Re-Rolling
Identify the recursive part

void f1(char *p, int n)
if (n == 1) {
*p += 1;                else {
} else if (n/2 == 1) {       f1(p, n/2/2/2);
*p += 1;                  f1(p+n/2/2/2, n/2/2/2);
*(p+n/2) += 1;            f1(p+n/2/2, n/2/2/2);
} else if (n/2/2 == 1) {     f1(p+n/2/2+n/2/2/2, n/2/2/2);
*p += 1;                  f1(p+n/2, n/2/2/2);
*(p+n/2/2) += 1;          f1(p+n/2+n/2/2/2, n/2/2/2);
*(p+n/2) += 1;            f1(p+n/2+n/2/2, n/2/2/2);
*(p+n/2+n/2/2) += 1;      f1(p+n/2+n/2/2+n/2/2/2, n/2/2/2);
}                          }
Transformation 3: Recursion Re-Rolling
Replace with the recursive part of the original procedure

void f1(char *p, int n)
if (n == 1) {
*p += 1;
} else if (n/2 == 1) {
*p += 1;
*(p+n/2) += 1;
} else if (n/2/2 == 1) {
*p += 1;                else {
*(p+n/2/2) += 1;          f1(p, n/2);
*(p+n/2) += 1;            f1(p+n/2, n/2);
*(p+n/2+n/2/2) += 1;    }
}
Final Result

void f1(char *p, int n)
if (n == 1) {
*p += 1;
} else if (n/2 == 1) {
*p += 1;
*(p+n/2) += 1;
} else if (n/2/2 == 1) {
*p += 1;                else {
*(p+n/2/2) += 1;          f1(p, n/2);
*(p+n/2) += 1;            f1(p+n/2, n/2);
*(p+n/2+n/2/2) += 1;    }
}
Speedup for Matrix Multiply
Matrix of 512 x 512 elements

inline   inline+fusion   inline+fusion+reroll
10

8
speedup

6

4

2

0
1               2                3
unrolling factor
Speedup for Matrix Multiply
Matrix of 1024 x 1024 elements

inline   inline+fusion   inline+fusion+reroll
10

8
speedup

6

4

2

0
1               2                3
unrolling factor
Other Optimizations

• Inlining moves code from the inter-procedural level to
the intra-procedural level

• Conditional fusion brings code from the inter-basic-
block level to the intra-basic-block level

• Together, inlining and conditional fusion give
subsequent compiler passes the opportunity to
perform more aggressive optimizations
Comparison to Hand Coded Programs

• Two applications: Matrix multiply, LU decomposition
• Three machines: Pentium III, Origin 2000, PowerPC
• Two different problem sizes

• Compare automatically unrolled programs to optimized,
hand coded versions from the Cilk benchmarks

• Best automatically unrolled version performs:
• Between 2.2 and 2.9 times worse for matrix multiply
• As good as hand coded version for LU
Related Work

• Procedure Inlining:
•   Scheifler (1977)
•   Richardson, Ganapathi (1989)
•   Chambers, Ungar (1989)
•   Cooper, Hall, Torczon (1991)
•   Appel (1992)
•   Chang, Mahlke, Chen, Hwu (1992)
Conclusion
• Recursion Unrolling
• analogous to the loop unrolling transformation

• Divide and Conquer Programs
• The programmer writes simple base cases
• The compiler automatically generates large base cases

• Key Techniques
• Inlining: conceptually inline recursive calls
• Conditional Fusion: simplify intra-procedural control flow
• Re-Rolling: ensure that large base cases are executed
Comparison to Hand Coded Programs

• Matrix multiply 512 x 512 elements:
• Best automatically unrolled program:      2.55 sec.
• Hand coded with three nested loops:       3.46 sec.
• Hand coded Cilk program:                  1.16 sec.

• Matrix multiply for 1024 x 1024 elements:
• Best automatically unrolled program:      20.47 sec.
• Hand coded with three nested loops:       27.40 sec.
• Hand coded Cilk program:                  9.19 sec.
Correctness

• Recursion unrolling preserves the semantics of the
program:

• The unrolled program terminates if and only if the
original recursive program terminates

• When both the original and the unrolled program
terminate, they yield the same result
Speedup for Matrix Multiply
Pentium III, Matrix of 512 x 512 elements

inline   inline+fusion   inline+fusion+reroll

10

8
speedup

6

4

2

0
1               2               3
unrolling factor
Speedup for Matrix Multiply
Pentium III, Matrix of 1024 x 1024 elements

inline   inline+fusion   inline+fusion+reroll

10

8
speedup

6

4

2

0
1               2               3
unrolling factor
Speedup for Matrix Multiply
Power PC, Matrix of 512 x 512 elements

inline   inline+fusion   inline+fusion+reroll

10

8
speedup

6

4

2

0
1               2               3
unrolling factor
Speedup for Matrix Multiply
Power PC, Matrix of 1024 x 1024 elements

inline   inline+fusion   inline+fusion+reroll

10

8
speedup

6

4

2

0
1               2               3
unrolling factor
Speedup for Matrix Multiply
Origin 2000, Matrix of 512 x 512 elements

inline   inline+fusion   inline+fusion+reroll
10

8
speedup

6

4

2

0
1               2                3
unrolling factor
Speedup for Matrix Multiply
Origin 2000, Matrix of 1024 x 1024 elements

inline   inline+fusion   inline+fusion+reroll
10

8
speedup

6

4

2

0
1               2                3
unrolling factor
Speedup for LU
Pentium III, Matrix of 512 x 512 elements

inline   inline+fusion   inline+fusion+reroll

10

8
speedup

6

4

2

0
1               2               3
unrolling factor
Speedup for LU
Pentium III, Matrix of 1024 x 1024 elements

inline   inline+fusion   inline+fusion+reroll

10

8
speedup

6

4

2

0
1               2               3
unrolling factor
Speedup for LU
Power PC, Matrix of 512 x 512 elements

inline   inline+fusion   inline+fusion+reroll

10

8
speedup

6

4

2

0
1               2               3
unrolling factor
Speedup for LU
Power PC, Matrix of 1024 x 1024 elements

inline   inline+fusion   inline+fusion+reroll

10

8
speedup

6

4

2

0
1               2               3
unrolling factor
Speedup for LU
Origin 2000, Matrix of 1024 x 1024 elements

inline   inline+fusion   inline+fusion+reroll

10

8
speedup

6

4

2

0
1               2               3
unrolling factor
Speedup for LU
Origin 2000, Matrix of 512 x 512 elements

inline   inline+fusion   inline+fusion+reroll
10

8
speedup

6

4

2

0
1               2                3
unrolling factor

```
DOCUMENT INFO
Shared By:
Categories:
Tags:
Stats:
 views: 5 posted: 10/3/2012 language: Unknown pages: 75