Embedded Systems Programming by 2du89zN

VIEWS: 5 PAGES: 15

									    Embedded Systems
      Programming

Writing Optimised C code for ARM
  Why write optimised C code?
• For embedded systems, size and/or speed
  are of key importance
• The compiler optimisation phase can only
  do so much
• In order to write optimal C code you need
  to know details of the underlying hardware
  and the compiler
       What compilers can’t do
• void memclr( char *      • Is N == 0 on first loop?
  data, int N)                – 0 – 1 is dangerous!
• {                        • Is data array 4 byte
•     for (; N > 0; N--)     aligned?
•     {                       – Can store using int
•            *data=0;      • Is N a multiple of 4?
                              – Could do 4 word
•            data++;            blocks at a time
•     }                    • Compilers have to be
• }                          conservative!
            An example Program
                                   • The program might
                                     seem fine – even
/* program showing inefficient
* variable and loop
                                     resource friendly
* usage craig Nov 04
 */
                                   • Using a char saves
                                     space
int checksum_1(int *data)
{                                  • for loops make good
         char i; int sum = 0;        assembler
         for (i =0; i < 64; i++)
              sum += data[i];      • Let's look at the
         return sum;
}                                    assembler code
@ checksum_1: compiler output for int checksum_1(int *data), which sums
@ data[0..63] using a char loop counter.
@ In:  r0 = data        Out: r0 = sum
@ Register use in the loop: r1 = data, r2 = i, r3 = scratch
.text
       .align  2
       .global checksum_1
       .type    checksum_1,function
checksum_1:
       @ args = 0, pretend = 0, frame = 0
       @ frame_needed = 1, current_function_anonymous_args = 0
       mov     ip, sp                     @ ip = sp on entry
       stmfd   sp!, {fp, ip, lr, pc}      @ push fp, entry sp, lr and pc
       sub     fp, ip, #4                 @ fp = entry sp - 4
       mov     r1, r0                     @ r1 = data (r0 is reused for sum)
       mov     r0, #0                     @ sum = 0
       mov     r2, r0                     @ i = 0
.L6:
       ldr     r3, [r1, r2, asl #2]       @ r3 = data[i] (i scaled by 4 bytes)
       add     r0, r0, r3                 @ sum += data[i]
       add     r3, r2, #1                 @ i + 1
       and     r2, r3, #255               @ i = (i + 1) & 0xff: keep only the 8 bits of a char
       cmp     r2, #63                    @ i < 64
       bls     .L6                        @ unsigned i <= 63: go round again
       ldmea fp, {fp, sp, pc}             @ restore fp and sp, return via the saved lr
.Lfe1:
       .size    checksum_1,.Lfe1-checksum_1
            What is wrong?
• The use of char means that the compiler
  has to cast to look at 8 bits – using
  – and   r2, r3, #255
• The loop variable requires a register and
  initialisation
• If the loop is called often then the test
  and branch are quite an overhead
             Variable sizes
• In general the compiler will use 32bit
  registers for local variables but will have to
  cast them when used as 8 or 16 bit values
• If you can, use unsigned ints; if you can’t,
  cast explicitly
• Using signed shorts can be quite a
  problem for compilers
                 Watch your shorts!
 short add( short a, short b)
 {
          return a + (b >> 1);
                                         • The above C code
 }                                         turns into the rather
      Becomes ….                           nasty assembler
                                         • The gnu C compiler is
mov     ip, sp                             very cautious when
        stmfd    sp!, {fp, ip, lr, pc}
        sub      fp, ip, #4                confronted with short
        mov      r1, r1, asl #16
        mov      r0, r0, asl #16
                                           variables
        mov      r0, r0, asr #16
        add      r0, r0, r1, asr #17
        mov      r0, r0, asl #16
        mov      r0, r0, asr #16
        ldmea    fp, {fp, sp, pc}
               Loops #1
• As well as using a char for a loop counter
  the loop counter could be redundant
• Terminate loops by counting down to 0; this
  reduces register usage and means no
  initialisation
• Use do..while instead of for loops
Efficient loop C

/*
 * Program to show efficient use of
 * variables and loops
 *
 * checksum_2 returns the sum of the 64 ints starting at data.
 * The counter is a full-width int that counts down to zero and the
 * loop is a do..while, so the only loop test is the decrement and
 * compare against zero at the bottom of the body.
 */
int checksum_2(int *data)
{
          int sum = 0, i = 64;
          do
          {
               sum += *(data++);     /* add the current element, then advance */
          } while ( --i != 0 );      /* 64 passes: i runs 64 -> 0 */
          return sum;
}
    Efficient loop assembler

@ checksum_2: compiler output for the do..while version of
@ int checksum_2(int *data), which sums 64 ints with a counter that
@ counts down to zero.
@ In:  r0 = data        Out: r0 = sum
@ Register use in the loop: r1 = data, r2 = i, r3 = scratch
checksum_2:
       @ args = 0, pretend = 0, frame = 0
       @ frame_needed = 1, current_function_anonymous_args = 0
       mov     ip, sp                     @ ip = sp on entry
       stmfd   sp!, {fp, ip, lr, pc}      @ push fp, entry sp, lr and pc
       sub     fp, ip, #4                 @ fp = entry sp - 4
       mov     r1, r0                     @ r1 = data (r0 is reused for sum)
       mov     r0, #0                     @ sum = 0
       mov     r2, #64                    @ i = 64
.L6:
       ldr     r3, [r1], #4               @ r3 = *data, then data advances by 4 bytes
       add     r0, r0, r3                 @ sum += loaded value
       subs    r2, r2, #1                 @ --i, setting the flags
       bne     .L6                        @ loop until i reaches 0
       ldmea fp, {fp, sp, pc}             @ restore fp and sp, return via the saved lr
             Loop unrolling
• If a loop is going to be repeated often then
  the test and branch can be quite an
  overhead
• If the loop count is a multiple of 4 and the loop
  is run quite a lot then the loop can be unrolled
• This increases code size but is more
  speed efficient
• Sizes that are not multiples of 4 can be
  done but are less efficient.
An unrolled loop
/*
 * Program to show efficient use of
 * variables and loops & loop unrolling
 *
 * checksum_2 returns the sum of the 64 ints starting at data.
 * The loop body is unrolled four times, so the test and branch are
 * executed once per four elements instead of once per element.
 */
int checksum_2(int *data)
{
  int sum = 0, i = 64;
  do
    {
      sum += *(data++);
      sum += *(data++);
      sum += *(data++);
      sum += *(data++);
      i -= 4;               /* four elements consumed per pass */
    } while ( i != 0 );     /* 64 is a multiple of 4, so i reaches exactly 0 */
  return sum;
}
@ checksum_2: compiler output for the four-times unrolled version of
@ int checksum_2(int *data), which sums 64 ints.
@ In:  r0 = data        Out: r0 = sum
@ Register use in the loop: r2 = data, r1 = i, r3 = scratch
checksum_2:
       @ args = 0, pretend = 0, frame = 0
       @ frame_needed = 1, current_function_anonymous_args = 0
       mov     ip, sp                     @ ip = sp on entry
       stmfd   sp!, {fp, ip, lr, pc}      @ push fp, entry sp, lr and pc
       sub     fp, ip, #4                 @ fp = entry sp - 4
       mov     r2, r0                     @ r2 = data (r0 is reused for sum)
       mov     r0, #0                     @ sum = 0
       mov     r1, #64                    @ i = 64
.L6:
       ldr     r3, [r2], #4               @ 1st element: load, data advances by 4 bytes
       add     r0, r0, r3                 @ sum += loaded value
       ldr     r3, [r2], #4               @ 2nd element
       add     r0, r0, r3
       ldr     r3, [r2], #4               @ 3rd element
       add     r0, r0, r3
       ldr     r3, [r2], #4               @ 4th element
       add     r0, r0, r3
       subs    r1, r1, #4                 @ i -= 4, setting the flags
       bne     .L6                        @ one test and branch per four elements
       ldmea fp, {fp, sp, pc}             @ restore fp and sp, return via the saved lr
Loop unrolling when N is not a multiple of 4
/*
 * Loop unrolling for an arbitrary element count.
 *
 * checksum_2 returns the sum of the first N ints at data.
 * Complete groups of four are summed by an unrolled loop; the
 * 0..3 elements left over are then summed one at a time.
 */
int checksum_2(int *data, unsigned int N)
{
  unsigned int groups = N / 4;    /* number of complete 4-int groups */
  unsigned int leftover = N & 3;  /* elements remaining after the groups */
  int sum = 0;

  while (groups != 0)
    {
      sum += data[0];
      sum += data[1];
      sum += data[2];
      sum += data[3];
      data += 4;
      groups--;
    }

  while (leftover != 0)
    {
      sum += *data;
      data++;
      leftover--;
    }

  return sum;
}

								
To top