Recursion Unrolling

Document Sample
Recursion Unrolling Powered By Docstoc
					        Recursion Unrolling
for Divide and Conquer Programs


    Radu Rugina and Martin Rinard
     Laboratory for Computer Science
   Massachusetts Institute of Technology
    What This Talk Is About



• Automatic generation of efficient
  large base cases for divide and
  conquer programs
                  Outline
1. Motivating Example
2. Computation Structure
3. Transformations
4. Related Work
5. Conclusion
1. Motivating Example
       Divide and Conquer Matrix Multiply

       A                B        =             R
  A0       A1       B0       B1       A0B0+A1B2 A0B1+A1B3
                                 =
  A2       A3       B2       B3       A2B0+A3B2 A2B1+A3B3



• Divide matrices into sub-matrices: A0, A1, A2, etc.
• Use blocked matrix multiply equations
       Divide and Conquer Matrix Multiply

       A                B        =             R
  A0       A1       B0       B1       A0B0+A1B2 A0B1+A1B3
                                 =
  A2       A3       B2       B3       A2B0+A3B2 A2B1+A3B3



• Recursively multiply sub-matrices
    Divide and Conquer Matrix Multiply

    A           B     =             R

    a0          b0    =           a0  b0




• Terminate recursion with a simple base case
    Divide and Conquer Matrix Multiply
void matmul(int *A, int *B, int *R, int n) {  Implements   R += A × B
 if (n == 1) {
    (*R) += (*A) * (*B);
 } else {
    matmul(A, B, R, n/4);
    matmul(A, B+(n/4), R+(n/4), n/4);
    matmul(A+2*(n/4), B, R+2*(n/4), n/4);
    matmul(A+2*(n/4), B+(n/4), R+3*(n/4), n/4);
    matmul(A+(n/4), B+2*(n/4), R, n/4);
    matmul(A+(n/4), B+3*(n/4), R+(n/4), n/4);
    matmul(A+3*(n/4), B+2*(n/4), R+2*(n/4), n/4);
    matmul(A+3*(n/4), B+3*(n/4), R+3*(n/4), n/4);
 }
    Divide and Conquer Matrix Multiply
void matmul(int *A, int *B, int *R, int n) {        Divide matrices into
 if (n == 1) {                                      sub-matrices and
    (*R) += (*A) * (*B);                            recursively multiply
 } else {
                                                    sub-matrices
    matmul(A, B, R, n/4);
    matmul(A, B+(n/4), R+(n/4), n/4);
    matmul(A+2*(n/4), B, R+2*(n/4), n/4);
    matmul(A+2*(n/4), B+(n/4), R+3*(n/4), n/4);
    matmul(A+(n/4), B+2*(n/4), R, n/4);
    matmul(A+(n/4), B+3*(n/4), R+(n/4), n/4);
    matmul(A+3*(n/4), B+2*(n/4), R+2*(n/4), n/4);
    matmul(A+3*(n/4), B+3*(n/4), R+3*(n/4), n/4);
 }
    Divide and Conquer Matrix Multiply
void matmul(int *A, int *B, int *R, int n) {      Identify sub-matrices
 if (n == 1) {                                    with pointers
    (*R) += (*A) * (*B);
 } else {
    matmul(A, B, R, n/4);
    matmul(A, B+(n/4), R+(n/4), n/4);
    matmul(A+2*(n/4), B, R+2*(n/4), n/4);
    matmul(A+2*(n/4), B+(n/4), R+3*(n/4), n/4);
    matmul(A+(n/4), B+2*(n/4), R, n/4);
    matmul(A+(n/4), B+3*(n/4), R+(n/4), n/4);
    matmul(A+3*(n/4), B+2*(n/4), R+2*(n/4), n/4);
    matmul(A+3*(n/4), B+3*(n/4), R+3*(n/4), n/4);
 }
    Divide and Conquer Matrix Multiply
void matmul(int *A, int *B, int *R, int n) {        Use a simple
 if (n == 1) {                                      algorithm for the
    (*R) += (*A) * (*B);                            base case
 } else {
    matmul(A, B, R, n/4);
    matmul(A, B+(n/4), R+(n/4), n/4);
    matmul(A+2*(n/4), B, R+2*(n/4), n/4);
    matmul(A+2*(n/4), B+(n/4), R+3*(n/4), n/4);
    matmul(A+(n/4), B+2*(n/4), R, n/4);
    matmul(A+(n/4), B+3*(n/4), R+(n/4), n/4);
    matmul(A+3*(n/4), B+2*(n/4), R+2*(n/4), n/4);
    matmul(A+3*(n/4), B+3*(n/4), R+3*(n/4), n/4);
 }
    Divide and Conquer Matrix Multiply
void matmul(int *A, int *B, int *R, int n) {        • Advantage of
 if (n == 1) {                                        small base case:
    (*R) += (*A) * (*B);
                                                      simplicity
 } else {
    matmul(A, B, R, n/4);
    matmul(A, B+(n/4), R+(n/4), n/4);               • Code is easy to:
    matmul(A+2*(n/4), B, R+2*(n/4), n/4);              •   Write
    matmul(A+2*(n/4), B+(n/4), R+3*(n/4), n/4);
                                                       •   Maintain
    matmul(A+(n/4), B+2*(n/4), R, n/4);
    matmul(A+(n/4), B+3*(n/4), R+(n/4), n/4);
                                                       •   Debug
    matmul(A+3*(n/4), B+2*(n/4), R+2*(n/4), n/4);      •   Understand
    matmul(A+3*(n/4), B+3*(n/4), R+3*(n/4), n/4);
 }
    Divide and Conquer Matrix Multiply
void matmul(int *A, int *B, int *R, int n) {      • Disadvantage:
 if (n == 1) {                                      inefficiency
    (*R) += (*A) * (*B);
 } else {
    matmul(A, B, R, n/4);                         • Large control flow
    matmul(A, B+(n/4), R+(n/4), n/4);               overhead:
    matmul(A+2*(n/4), B, R+2*(n/4), n/4);         • Most of the time is
    matmul(A+2*(n/4), B+(n/4), R+3*(n/4), n/4);     spent in dividing the
    matmul(A+(n/4), B+2*(n/4), R, n/4);             matrix into sub-matrices
    matmul(A+(n/4), B+3*(n/4), R+(n/4), n/4);
    matmul(A+3*(n/4), B+2*(n/4), R+2*(n/4), n/4);
    matmul(A+3*(n/4), B+3*(n/4), R+3*(n/4), n/4);
 }
                            Hand Coded Implementation
void serialmul(block *As, block *Bs, block *Rs)            s0_0   +=   ap[5] * bp[80];                 s1_1 += ap[29] *   bp[209];
{                                                          s0_1   +=   ap[5] * bp[81];                 s0_0 += ap[14] *   bp[224];
    int i, j;                                              s1_0   +=   ap[21] * bp[80];                s0_1 += ap[14] *   bp[225];
                                                           s1_1   +=   ap[21] * bp[81];                s1_0 += ap[30] *   bp[224];
   DOUBLE *A = (DOUBLE *) As;                              s0_0   +=   ap[6] * bp[96];                 s1_1 += ap[30] *   bp[225];
   DOUBLE *B = (DOUBLE *) Bs;                              s0_1   +=   ap[6] * bp[97];                 s0_0 += ap[15] *   bp[240];
   DOUBLE *R = (DOUBLE *) Rs;                              s1_0   +=   ap[22] * bp[96];                s0_1 += ap[15] *   bp[241];
                                                           s1_1   +=   ap[22] * bp[97];                s1_0 += ap[31] *   bp[240];
   for (j = 0; j < 16; j += 2) {                           s0_0   +=   ap[7] * bp[112];                s1_1 += ap[31] *   bp[241];
       DOUBLE *bp = &B[j];                                 s0_1   +=   ap[7] * bp[113];                rp[0] = s0_0;
       for (i = 0; i < 16; i += 2) {                       s1_0   +=   ap[23] * bp[112];               rp[1] = s0_1;
           DOUBLE *ap = &A[i * 16];                        s1_1   +=   ap[23] * bp[113];               rp[16] = s1_0;
           DOUBLE *rp = &R[j + i * 16];                    s0_0   +=   ap[8] * bp[128];                rp[17] = s1_1;
           register DOUBLE s0_0 = rp[0], s0_1 = rp[1];     s0_1   +=   ap[8] * bp[129];            }
           register DOUBLE s1_0 = rp[16], s1_1 = rp[17];   s1_0   +=   ap[24] * bp[128];       }
           s0_0 += ap[0] * bp[0];                          s1_1   +=   ap[24] * bp[129];   }
           s0_1 += ap[0] * bp[1];                          s0_0   +=   ap[9] * bp[144];
           s1_0 += ap[16] * bp[0];                         s0_1   +=   ap[9] * bp[145];    cilk void matrixmul(long nb, block *A, block *B, block *R)
           s1_1 += ap[16] * bp[1];                         s1_0   +=   ap[25] * bp[144];   {
           s0_0 += ap[1] * bp[16];                         s1_1   +=   ap[25] * bp[145];       if (nb == 1) {
           s0_1 += ap[1] * bp[17];                         s0_0   +=   ap[10] * bp[160];            flops = serialmul(A, B, R);
           s1_0 += ap[17] * bp[16];                        s0_1   +=   ap[10] * bp[161];       } else if (nb >= 4) {
           s1_1 += ap[17] * bp[17];                        s1_0   +=   ap[26] * bp[160];              spawn matrixmul(nb/4, A, B, R);
           s0_0 += ap[2] * bp[32];                         s1_1   +=   ap[26] * bp[161];              spawn matrixmul(nb/4, A, B+(nb/4), R+(nb/4));
           s0_1 += ap[2] * bp[33];                         s0_0   +=   ap[11] * bp[176];              spawn matrixmul(nb/4, A+2*(nb/4), B+(nb/4), R+2*(nb/4));
           s1_0 += ap[18] * bp[32];                        s0_1   +=   ap[11] * bp[177];              spawn matrixmul(nb/4, A+2*(nb/4), B, R+3*(nb/4));
           s1_1 += ap[18] * bp[33];                        s1_0   +=   ap[27] * bp[176];              sync;
           s0_0 += ap[3] * bp[48];                         s1_1   +=   ap[27] * bp[177];              spawn matrixmul(nb/4, A+(nb/4), B+2*(nb/4), R);
           s0_1 += ap[3] * bp[49];                         s0_0   +=   ap[12] * bp[192];              spawn matrixmul(nb/4, A+(nb/4), B+3*(nb/4), R+(nb/4));
           s1_0 += ap[19] * bp[48];                        s0_1   +=   ap[12] * bp[193];              spawn matrixmul(nb/4, A+3*(nb/4), B+3*(nb/4), R+2*(nb/4));
           s1_1 += ap[19] * bp[49];                        s1_0   +=   ap[28] * bp[192];              spawn matrixmul(nb/4, A+3*(nb/4), B+3*(nb/4), R+3*(nb/4));
           s0_0 += ap[4] * bp[64];                         s1_1   +=   ap[28] * bp[193];              sync;
           s0_1 += ap[4] * bp[65];                         s0_0   +=   ap[13] * bp[208];       }
           s1_0 += ap[20] * bp[64];                        s0_1   +=   ap[13] * bp[209];   }
           s1_1 += ap[20] * bp[65];                        s1_0   +=   ap[29] * bp[208];
                Goal


• The programmer writes simple code
  with small base cases

• The compiler automatically generates
  efficient code with large base cases
2. Computation Structure
 Running Example – Array Increment

void f(char *p, int n)
  if (n == 1) {
       /* base case: increment one element */
       (*p) += 1;
  } else {
       f(p, n/2);        /* increment first half */
       f(p+n/2, n/2);    /* increment second half */
  }
}
Dynamic Call Tree for n=4
      Execution of f(p,4)
Dynamic Call Tree for n=4
      Execution of f(p,4)
           Test n=1
         Call f Call f
           Dynamic Call Tree for n=4
                   Execution of f(p,4)
Activation Frame        Test n=1
  on the Stack        Call f Call f
Dynamic Call Tree for n=4
      Execution of f(p,4)
           Test n=1          Executed
         Call f Call f      Instructions
Dynamic Call Tree for n=4
      Execution of f(p,4)
           Test n=1
         Call f Call f
      Dynamic Call Tree for n=4
               Execution of f(p,4)
                        Test n=1
n=4
                      Call f Call f



        Test n=1                        Test n=1
n=2
      Call f Call f                   Call f Call f
        Dynamic Call Tree for n=4
                   Execution of f(p,4)
                            Test n=1
n=4
                          Call f Call f



            Test n=1                        Test n=1
n=2
          Call f Call f                   Call f Call f



      Test n=1    Test n=1        Test n=1        Test n=1
n=1
       Inc *p      Inc *p          Inc *p          Inc *p
           Control Flow Overhead
                   Execution of f(p,4)

n=4
                            Test n=1                Call overhead
                          Call f Call f



            Test n=1                        Test n=1
n=2
          Call f Call f                   Call f Call f



      Test n=1    Test n=1        Test n=1        Test n=1
n=1
       Inc *p      Inc *p          Inc *p          Inc *p
           Control Flow Overhead
                   Execution of f(p,4)

n=4
                            Test n=1                Call overhead +
                          Call f Call f              Test overhead


            Test n=1                        Test n=1
n=2
          Call f Call f                   Call f Call f



      Test n=1    Test n=1        Test n=1        Test n=1
n=1
       Inc *p      Inc *p          Inc *p          Inc *p
                   Computation
                   Execution of f(p,4)

n=4
                            Test n=1                Call overhead +
                          Call f Call f              Test overhead
                                                    Computation

            Test n=1                        Test n=1
n=2
          Call f Call f                   Call f Call f



      Test n=1    Test n=1        Test n=1        Test n=1
n=1
       Inc *p      Inc *p          Inc *p          Inc *p
 Large Base Cases = Reduced Overhead
               Execution of f(p,4)
                       Test n=2
n=4
                     Call f Call f



         Test n=2                     Test n=2
n=2
          Inc *p                       Inc *p
        Inc *(p+1)                   Inc *(p+1)
3. Transformations
  Transformation 1: Recursion Inlining
     Start with the original recursive procedure

void f (char *p, int n)
  if (n == 1) {
       (*p) += 1;
  } else {
       f(p, n/2);
       f(p+n/2, n/2);
  }
  Transformation 1: Recursion Inlining
     Make two copies of the original procedure

void f1(char *p, int n)     void f2(char *p, int n)
  if (n == 1) {               if (n == 1) {
       (*p) += 1;                  (*p) += 1;
  } else {                    } else {
       f1(p, n/2);                 f2(p, n/2);
       f1(p+n/2, n/2);             f2(p+n/2, n/2);
  }                           }
  Transformation 1: Recursion Inlining
   Transform direct recursion to mutual recursion

void f1(char *p, int n)     void f2(char *p, int n)
  if (n == 1) {               if (n == 1) {
       (*p) += 1;                  (*p) += 1;
  } else {                    } else {
       f2(p, n/2);                 f1(p, n/2);
       f2(p+n/2, n/2);             f1(p+n/2, n/2);
  }                           }
  Transformation 1: Recursion Inlining
        Inline procedure f2 at call sites in f1

void f1(char *p, int n)       void f2(char *p, int n)
  if (n == 1) {                 if (n == 1) {
       (*p) += 1;                    (*p) += 1;
  } else {                      } else {
       f2(p, n/2);                   f1(p, n/2);
       f2(p+n/2, n/2);               f1(p+n/2, n/2);
  }                             }
Transformation 1: Recursion Inlining
 void f1(char *p, int n)
   if (n == 1) {
      (*p) += 1;
   } else {
      if (n/2 == 1) {
         *p += 1;
      } else {
         f1(p, n/2/2);
         f1(p+n/2/2, n/2/2);
      }
      if (n/2 == 1) {
         *(p+n/2) += 1;
      } else {
         f1(p+n/2, n/2/2);
         f1(p+n/2+n/4, n/2/2);
      }
   }
Transformation 1: Recursion Inlining
 void f1(char *p, int n)
   if (n == 1) {
                                 • Reduced procedure
      (*p) += 1;                   call overhead
   } else {
      if (n/2 == 1) {
         *p += 1;
                                 • More code exposed
      } else {                     at the intra-procedural
         f1(p, n/2/2);             level
         f1(p+n/2/2, n/2/2);
      }
      if (n/2 == 1) {            • Opportunities to
         *(p+n/2) += 1;            simplify control flow
      } else {                     in the inlined code
         f1(p+n/2, n/2/2);
         f1(p+n/2+n/4, n/2/2);
      }
   }
Transformation 1: Recursion Inlining
 void f1(char *p, int n)
   if (n == 1) {
                                 • Reduced procedure
      (*p) += 1;                   call overhead
   } else {
      if (n/2 == 1) {
         *p += 1;
                                 • More code exposed
      } else {                     at the intra-procedural
         f1(p, n/2/2);             level
         f1(p+n/2/2, n/2/2);
      }
      if (n/2 == 1) {            • Opportunities to
         *(p+n/2) += 1;            simplify control flow
      } else {                     in the inlined code:
         f1(p+n/2, n/2/2);
                                 • identical condition
         f1(p+n/2+n/4, n/2/2);
      }
                                   expressions
   }
Transformation 2: Conditional Fusion
  Merge if statements with identical conditions

void f1(char *p, int n)
  if (n == 1) {
     *p += 1;
  } else if (n/2 == 1) {
     *p += 1;
     *(p+n/2) += 1;
  } else {
     f1(p, n/2/2);
     f1(p+n/2/2, n/2/2);
     f1(p+n/2, n/2/2);
     f1(p+n/2+n/4, n/2/2);
  }
Transformation 2: Conditional Fusion
  Merge if statements with identical conditions

void f1(char *p, int n)      • Reduced branching
  if (n == 1) {                overhead and bigger
     *p += 1;                  basic blocks
  } else if (n/2 == 1) {
     *p += 1;
                             • Larger base case for
     *(p+n/2) += 1;
                               n/2 = 1
  } else {
     f1(p, n/2/2);
     f1(p+n/2/2, n/2/2);
     f1(p+n/2, n/2/2);
     f1(p+n/2+n/4, n/2/2);
  }
              Unrolling Iterations
 Repeatedly apply inlining and conditional fusion
void f1(char *p, int n)
  if (n == 1) {
     *p += 1;
  } else if (n/2 == 1) {
     *p += 1;
     *(p+n/2) += 1;
  } else {
     f1(p, n/2/2);
     f1(p+n/2/2, n/2/2);
     f1(p+n/2, n/2/2);
     f1(p+n/2+n/4, n/2/2);
  }
        Second Unrolling Iteration

void f1(char *p, int n)      void f2(char *p, int n)
  if (n == 1) {                if (n == 1) {
     *p += 1;                     *p += 1;
  } else if (n/2 == 1) {       } else {
     *p += 1;                     f2(p, n/2);
     *(p+n/2) += 1;               f2(p+n/2, n/2);
  } else {                     }
     f1(p, n/2/2);
     f1(p+n/2/2, n/2/2);
     f1(p+n/2, n/2/2);
     f1(p+n/2+n/4, n/2/2);
  }
        Second Unrolling Iteration

void f1(char *p, int n)      void f2(char *p, int n)
  if (n == 1) {                if (n == 1) {
     *p += 1;                     *p += 1;
  } else if (n/2 == 1) {       } else {
     *p += 1;                     f1(p, n/2);
     *(p+n/2) += 1;               f1(p+n/2, n/2);
  } else {                     }
     f2(p, n/2/2);
     f2(p+n/2/2, n/2/2);
     f2(p+n/2, n/2/2);
     f2(p+n/2+n/4, n/2/2);
  }
Result of Second Unrolling Iteration

void f1(char *p, int n)
  if (n == 1) {
     *p += 1;                else {
  } else if (n/2 == 1) {       f1(p, n/2/2/2);
     *p += 1;                  f1(p+n/2/2/2, n/2/2/2);
     *(p+n/2) += 1;            f1(p+n/2/2, n/2/2/2);
  } else if (n/2/2 == 1) {     f1(p+n/2/2+n/2/2/2, n/2/2/2);
     *p += 1;                  f1(p+n/2, n/2/2/2);
     *(p+n/2/2) += 1;          f1(p+n/2+n/2/2/2, n/2/2/2);
     *(p+n/2) += 1;            f1(p+n/2+n/2/2, n/2/2/2);
     *(p+n/2+n/2/2) += 1;      f1(p+n/2+n/2/2+n/2/2/2, n/2/2/2);
  }                          }
             Unrolling Iterations

• The unrolling process stops when the number of
  iterations reaches the desired unrolling factor

• The unrolled recursive procedure:
   • Has base cases for larger problem sizes
   • Divides the given problem into more sub-problems
     of smaller sizes

• In our example:
   • Base cases for n=1, n=2, and n=4
   • Problems are divided into 8 problems of 1/8 size
               Speedup for Matrix Multiply
                 Matrix of 512 x 512 elements

                   inline               inline+fusion
          10

          8
speedup




          6

          4

          2

          0
                      1                        2
                            unrolling factor
               Speedup for Matrix Multiply
                 Matrix of 512 x 512 elements

                   inline               inline+fusion
          10

          8
speedup




          6

          4

          2

          0
                      1                        2
                            unrolling factor
               Speedup for Matrix Multiply
                Matrix of 1024 x 1024 elements

                   inline               inline+fusion
          10

          8
speedup




          6

          4

          2

          0
                      1                        2
                            unrolling factor
  Efficiency of Unrolled Recursive Part

• Because the recursive part is also unrolled,
   recursion may not exercise the large base cases

• Which base case is executed depends on the size of
  the input problem

• In our example:
   • For a problem of size n=8, the base case for n=1 is executed
   • For a problem of size n=16, the base case for n=2 is executed
   • The efficient base case for n=4 is not executed in these cases
      Solution: Recursion Re-Rolling

• Roll back the recursive part of the unrolled procedure
  after the large base cases are generated

• Re-Rolling ensures that larger base cases are always
  executed, independent of the input problem size

• The compiler unrolls the recursive part only
  temporarily, to generate the base cases
Transformation 3: Recursion Re-Rolling

 void f1(char *p, int n)
   if (n == 1) {
      *p += 1;                else {
   } else if (n/2 == 1) {       f1(p, n/2/2/2);
      *p += 1;                  f1(p+n/2/2/2, n/2/2/2);
      *(p+n/2) += 1;            f1(p+n/2/2, n/2/2/2);
   } else if (n/2/2 == 1) {     f1(p+n/2/2+n/2/2/2, n/2/2/2);
      *p += 1;                  f1(p+n/2, n/2/2/2);
      *(p+n/2/2) += 1;          f1(p+n/2+n/2/2/2, n/2/2/2);
      *(p+n/2) += 1;            f1(p+n/2+n/2/2, n/2/2/2);
      *(p+n/2+n/2/2) += 1;      f1(p+n/2+n/2/2+n/2/2/2, n/2/2/2);
   }                          }
Transformation 3: Recursion Re-Rolling
              Identify the recursive part

 void f1(char *p, int n)
   if (n == 1) {
      *p += 1;                else {
   } else if (n/2 == 1) {       f1(p, n/2/2/2);
      *p += 1;                  f1(p+n/2/2/2, n/2/2/2);
      *(p+n/2) += 1;            f1(p+n/2/2, n/2/2/2);
   } else if (n/2/2 == 1) {     f1(p+n/2/2+n/2/2/2, n/2/2/2);
      *p += 1;                  f1(p+n/2, n/2/2/2);
      *(p+n/2/2) += 1;          f1(p+n/2+n/2/2/2, n/2/2/2);
      *(p+n/2) += 1;            f1(p+n/2+n/2/2, n/2/2/2);
      *(p+n/2+n/2/2) += 1;      f1(p+n/2+n/2/2+n/2/2/2, n/2/2/2);
   }                          }
 Transformation 3: Recursion Re-Rolling
Replace with the recursive part of the original procedure

    void f1(char *p, int n)
      if (n == 1) {
         *p += 1;
      } else if (n/2 == 1) {
         *p += 1;
         *(p+n/2) += 1;
      } else if (n/2/2 == 1) {
         *p += 1;                else {
         *(p+n/2/2) += 1;          f1(p, n/2);
         *(p+n/2) += 1;            f1(p+n/2, n/2);
         *(p+n/2+n/2/2) += 1;    }
      }
                    Final Result

void f1(char *p, int n)
  if (n == 1) {
     *p += 1;
  } else if (n/2 == 1) {
     *p += 1;
     *(p+n/2) += 1;
  } else if (n/2/2 == 1) {
     *p += 1;                else {
     *(p+n/2/2) += 1;          f1(p, n/2);
     *(p+n/2) += 1;            f1(p+n/2, n/2);
     *(p+n/2+n/2/2) += 1;    }
  }
               Speedup for Matrix Multiply
                   Matrix of 512 x 512 elements

                inline   inline+fusion   inline+fusion+reroll
          10

          8
speedup




          6

          4

          2

          0
                     1               2                3
                              unrolling factor
               Speedup for Matrix Multiply
                 Matrix of 1024 x 1024 elements

                inline   inline+fusion   inline+fusion+reroll
          10

          8
speedup




          6

          4

          2

          0
                     1               2                3
                              unrolling factor
              Other Optimizations

• Inlining moves code from the inter-procedural level to
  the intra-procedural level

• Conditional fusion brings code from the inter-basic-
  block level to the intra-basic-block level

• Together, inlining and conditional fusion give
  subsequent compiler passes the opportunity to
  perform more aggressive optimizations
 Comparison to Hand Coded Programs

• Two applications: Matrix multiply, LU decomposition
• Three machines: Pentium III, Origin 2000, PowerPC
• Two different problem sizes

• Compare automatically unrolled programs to optimized,
  hand coded versions from the Cilk benchmarks

• Best automatically unrolled version performs:
   • Between 2.2 and 2.9 times worse for matrix multiply
   • As good as hand coded version for LU
                Related Work

• Procedure Inlining:
  •   Scheifler (1977)
  •   Richardson, Ganapathi (1989)
  •   Chambers, Ungar (1989)
  •   Cooper, Hall, Torczon (1991)
  •   Appel (1992)
  •   Chang, Mahlke, Chen, Hwu (1992)
                   Conclusion
• Recursion Unrolling
  • analogous to the loop unrolling transformation


• Divide and Conquer Programs
  • The programmer writes simple base cases
  • The compiler automatically generates large base cases


• Key Techniques
  • Inlining: conceptually inline recursive calls
  • Conditional Fusion: simplify intra-procedural control flow
  • Re-Rolling: ensure that large base cases are executed
 Comparison to Hand Coded Programs

• Matrix multiply 512 x 512 elements:
  • Best automatically unrolled program:      2.55 sec.
  • Hand coded with three nested loops:       3.46 sec.
  • Hand coded Cilk program:                  1.16 sec.

• Matrix multiply for 1024 x 1024 elements:
  • Best automatically unrolled program:      20.47 sec.
  • Hand coded with three nested loops:       27.40 sec.
  • Hand coded Cilk program:                  9.19 sec.
                    Correctness

• Recursion unrolling preserves the semantics of the
  program:

   • The unrolled program terminates if and only if the
     original recursive program terminates

   • When both the original and the unrolled program
     terminate, they yield the same result
               Speedup for Matrix Multiply
           Pentium III, Matrix of 512 x 512 elements

                inline   inline+fusion   inline+fusion+reroll

          10

          8
speedup




          6

          4

          2

          0
                     1               2               3
                             unrolling factor
               Speedup for Matrix Multiply
          Pentium III, Matrix of 1024 x 1024 elements

                inline   inline+fusion   inline+fusion+reroll

          10

          8
speedup




          6

          4

          2

          0
                     1               2               3
                             unrolling factor
                Speedup for Matrix Multiply
               Power PC, Matrix of 512 x 512 elements

                  inline   inline+fusion   inline+fusion+reroll

          10

          8
speedup




          6

          4

          2

          0
                       1               2               3
                               unrolling factor
               Speedup for Matrix Multiply
           Power PC, Matrix of 1024 x 1024 elements

                inline   inline+fusion   inline+fusion+reroll

          10

          8
speedup




          6

          4

          2

          0
                     1               2               3
                             unrolling factor
               Speedup for Matrix Multiply
           Origin 2000, Matrix of 512 x 512 elements

                inline   inline+fusion   inline+fusion+reroll
          10

          8
speedup




          6

          4

          2

          0
                     1               2                3
                              unrolling factor
               Speedup for Matrix Multiply
          Origin 2000, Matrix of 1024 x 1024 elements

                inline   inline+fusion   inline+fusion+reroll
          10

          8
speedup




          6

          4

          2

          0
                     1               2                3
                              unrolling factor
                        Speedup for LU
           Pentium III, Matrix of 512 x 512 elements

               inline   inline+fusion   inline+fusion+reroll

          10

          8
speedup




          6

          4

          2

          0
                    1               2               3
                             unrolling factor
                        Speedup for LU
          Pentium III, Matrix of 1024 x 1024 elements

               inline   inline+fusion   inline+fusion+reroll

          10

          8
speedup




          6

          4

          2

          0
                    1               2               3
                            unrolling factor
                           Speedup for LU
               Power PC, Matrix of 512 x 512 elements

                  inline   inline+fusion   inline+fusion+reroll

          10

          8
speedup




          6

          4

          2

          0
                       1               2               3
                                unrolling factor
                        Speedup for LU
           Power PC, Matrix of 1024 x 1024 elements

               inline   inline+fusion   inline+fusion+reroll

          10

          8
speedup




          6

          4

          2

          0
                    1               2               3
                            unrolling factor
                        Speedup for LU
          Origin 2000, Matrix of 1024 x 1024 elements

               inline   inline+fusion   inline+fusion+reroll

          10

          8
speedup




          6

          4

          2

          0
                    1               2               3
                             unrolling factor
                        Speedup for LU
           Origin 2000, Matrix of 512 x 512 elements

               inline   inline+fusion   inline+fusion+reroll
          10

          8
speedup




          6

          4

          2

          0
                    1               2                3
                             unrolling factor

				
DOCUMENT INFO
Shared By:
Categories:
Tags:
Stats:
views:5
posted:10/3/2012
language:Unknown
pages:75