Programming and Perl for Bioinformatics Part IV

Document Sample
Programming and Perl for Bioinformatics Part IV Powered By Docstoc
					Programming and Perl
         for
   Bioinformatics

       Part IV
  Subroutines
Making Programs Reusable
           by
 Creating New Functions
                    Subroutines – Why?
            my $dna = 'CGACGTCTTCTTAGGCGA'; # Sequence with 3’ UTR
            $dna =~ s/$stop_codon.*//; # Remove stop codon, 3’ UTR
            my $len = length($dna)/3; # resulting protein length

            # A bunch of other stuff…

            # Later, do the same thing to another sequence
            my $new_dna = <FASTA>; # Read in DNA from FASTA file

            # Do the same thing
            $dna =~ s/$stop_codon.*//;
            $len = length($dna)/3;


        Harder to read larger program
        What if we need to remove 5' UTR? Update
        every copy
6/13/2010                       Perl in a Day - Subroutines          3
                  Subroutines – Example
    my $dna1 = 'CGACGTCTTCTCAGGCGA';
    my $len = &get_translated_length($dna1); # call subroutine
    print "DNA with 3' UTR: $dna1. Protein length: $len\n";

    my $dna2 = <FASTA>;
    # Call the subroutine again, with a different argument
    $len = &get_translated_length($dna2); print $len;

    sub get_translated_length {
        my ($dna) = @_; #changing $dna won't change $dna1 in main
        $dna =~ s/$stop_codon.*//; # Remove stop codon & 3' UTR
        my $plen = length($dna)/3; # length of resulting protein
        return $plen;
    }



             Only one copy of the code
             Main program becomes shorter and simpler
6/13/2010                   Perl in a Day - Subroutines             4
        Subroutines – View from the Outside

             Subroutines: Write your own Perl functions
             main program calls subroutine
                 &get_translated_length
             It passes one or more arguments ($new_dna)
             Code in the subroutine gets executed

             Subroutine returns results ($plen) to caller
              Perl subroutines can return multiple values
              Some subroutines return no values
6/13/2010                     Perl in a Day - Subroutines    5
            Subroutines – View from the Inside
       # Comments describe the subroutine
       sub some_name { # starts a subroutine
          my ($thing, $other, ...) = @_;
                               # gets the arguments
          # Put fancy code here…      - calculates, prints,
         # More code…         - does other stuff
         # More               - calls other subroutines?
         return ($first, $second, ...);
                        # returns stuff to caller
       }                # ends subroutine

6/13/2010                    Perl in a Day - Subroutines      6
                     Arguments
   $n = &max(10, 15); # This sub call has two parameters

   sub max { # find the maximum value
       if ($_[0] > $_[1]) {
            $_[0];
       } else {
            $_[1];
       }
     }
   $n = &max(10, 15, 27);        # Oops!
             Arguments - Private Variables

   You can create private variables called lexical variables at any
    time with the “my” operator:
   sub max {
      my($a, $b);    # new, private variables for this         block
      ($a, $b) = @_ ; # give names to the parameters
      if ($a > $b) { $a } else { $b }
    }
   Or just use:
    my($a, $b) =      @_ ;   # Name the subroutine parameters
        Variable-length Parameter Lists
   The subroutine can easily check that it has the right
    number of arguments by examining the @_ array.

   sub max {
       if (@_ != 2) {
       print "WARNING! &max should get exactly
                  two arguments!\n";
       }
       # continue as before... . . .
    }
         Variable-length Parameter Lists
   In real-world Perl programming, this sort of check is hardly ever used; it's
    better to make the subroutine adapt to the parameters.

   $maximum = &max(3, 5, 10, 4, 6);
   sub max {
        my($max_so_far) = shift @_;
                       # the first one is the largest yet seen
          foreach (@_){ # look at the remaining arguments
              if ($_ > $max_so_far){
                $max_so_far = $_;
              }
          }
          $max_so_far;
    }
          Notes on Lexical Variable

   my ( $num) = @_ ;
    # List context; same as ($num) = @_
    # get the first element of the array


   my $num = @_ ;
    # Scalar context; same as $num = @_
    # get the size of the array
                        Scoping Rule
   #!/usr/bin/perl -w
    # Illustrating the pitfalls of not using my
    # variables
    $dna = 'AAAAA';
    $result = A_to_T($dna);
    print "I changed all the A's in $dna to T's
       and got $result\n\n";

   sub A_to_T {
       my ( $input ) = @_;
       $dna = $input;
       $dna =~ s/A/T/g;
       return $dna;
                      I changed all the A's in TTTTT to T's and got TTTTT
    }
                        Scoping Rule
   #!/usr/bin/perl -w
    # Illustrating the pitfalls of not using my
    # variables
    $dna = 'AAAAA'; # global variable
    $result = A_to_T($dna);
    print "I changed all the A's in $dna to T's
       and got $result\n\n";

   sub A_to_T {
       my ( $input ) = @_;
       my ( $dna ) = $input; # different scope
       $dna =~ s/A/T/g;
       return $dna;
    }                I changed all the A's in AAAAA to T's and got TTTTT
   Subroutines – Organizing Code By Function
               Code reuse
                 Call same subroutine from different parts of your program
                 More general:
                          $len = &get_protein_length($dna, $remove);
               Organization
                 E.g., separate messy math from main program flow
                 Each subroutine can only mess up its own variables

               Easier testing
                   Test subroutine code separately
               Increased efficiency
                   Write code just once, optimize just one sub
               Coder's Creed: Never write the same code twice
6/13/2010                                Perl in a Day - Subroutines          14
                                   Modules
            A set of related subroutines
             Placed in a separate file

             Included in the original file with the use
             command
           We've been using modules all along
             use strict;

                # strict is a special module called a “pragma”

               use BeginPerlBioinfo;
                # book’s module; download BeginPerlBioinfo.pm

6/13/2010                            Perl in a Day - Modules     15
                             Modules
      Getting new modules
       Thousands     of modules available at www.cpan.org
       search.cpan.org (E.g., search for "transcription factor")

       Usually simple to install

       Basically, installation places .pm file(s) in /usr/…

       Or a different directory Perl knows to look in


       Benefits (like subroutine benefits, but more so)
       Organization:separate a set of functionality
       Code reuse: don't have to re-write code for every program

       Modules also give you access to new classes…

6/13/2010                      Perl in a Day - Modules              16
                 Example of Module
   #!/usr/bin/perl
    # file: basename.pl
    use strict;
    use File::Basename;
    my $path = '/home/achou/perl/myprogram.pl';
    my $base = basename($path);
    my $dir = dirname($path);
    print "The base is $base and the directory
             is $dir.\n";
Output:
The base is myprogram.pl and the directory is /home/achou/perl.
            Anatomy of a Module File
   package MySequence;
    #file: MySequence.pm
    use strict;
    our $EcoRI = 'ggatcc';
    sub reversec {
        my $sequence = shift;
        $sequence = reverse $sequence;
        $sequence =~ tr/gatcGATC/ctagCTAG/;
        return $sequence;
    }
    sub seqlen {
        my $sequence = shift;
        $sequence =~ s/[^gatcnGATCN]//g;
        return length $sequence;
    }

    1;
            Anatomy of a Module File
   A module begins with the keyword package and ends
    with "1;".
   “package” gives the module a name, and the 1; is a
    true value that tells Perl that the module compiled
    completely without crashing.
   The our keyword declares a variable to be global to the
    module. It is similar to my, but the variable can be
    shared with other programs and modules ("my"
    variables cannot be shared outside the current file,
    subroutine or block). This will let us use the variable in
    other programs that depend on this module.
    Using the MySequence.pm Module

   #!/usr/bin/perl
    #file: sequence.pl
    use strict;
    use MySequence;
    my $sequence =
       'gattccggatttccaaagggttcccaatttggg';
    my $complement =
       MySequence::reversec($sequence);
    print "original = $sequence\n";
    print "complement = $complement\n";
    Using the MySequence.pm Module

   Unless you explicitly export variables or
    functions, the calling function must explicitly
    qualify each MySequence function by using the
    notation:
        MySequence::function_name

   For a non-exported variable, the notation looks
    like this:
        $MySequence::EcoRI