#!/usr/bin/perl -w use strict; use POSIX qw(ceil floor); # Check to make sure that one file was given on the command line, if not # print a correct usage statement. # ##### ##### @ARGV is the array of the command line arguments. != is the operator ##### for not equals. ##### ##### We are seeing whether the number of command line arguments is 1 ##### ie. NOT (NOT EQUALS 1) is EQUALS 1, and bypasses the if statement ##### if (@ARGV != 1) { die "\nUsage: perl nuc_count.pl \n\n"; } # Get the file name and open an instream to it. # ##### ##### shift is pulling the first element of off the array. ##### An array is basically like a list, we can go over this. ##### my $file = shift @ARGV; ##### ##### Open file for reading or if that fails, then die (kill yourself) ##### and write the error message. ##### open (IN, "<$file") or die ("Couldn't open file: $file\n"); #Initialize sequence as an empty string # ##### ##### We can see that $sequence is initialized. This is highly important ##### you otherwise would not know what was in $sequence to start with ##### (especially in other programming languages.) ##### my $sequence = ""; # Read in the contents of the file one line at a time and store it in $line # ##### ##### Remember the while loop, here we are reading each line until the EOF ##### (End of file). $line is only defined for the loop ... this is called ##### scope. Outside of the loop $line is no longer defined. ##### while (my $line = ) { # if it's a line that begins with a word character (if ($line =~ /^\w/)) # chomp off the newline (chomp($line)) # uppercase it for easier counting (uc($line) # and append it to the end of $sequence, the string that is the growing # sequence ($sequence .= uc($line)) . # # This will skip the fasta header lines, since they always begin # with a ">". It will also skip blank lines. # if ($line =~ /^\w/) { chomp($line); $sequence .= uc($line); } ##### ##### Another way of coding this would be to check if the line is not > or ##### whitespacee or blank, then do the following ... the code follows ##### if ( ($line !~ /^>/) && ($line !~ /^\s*$/) ) { ##### chomp ($line); ##### $sequence .= uc($line) ##### } ##### ##### (Note, this method is not as good, as we are trying to figure ##### out what we will not see, rather than what we want.) ##### ##### the !~ (means not similar to) the =~ (means similar to) ##### the /^ means starts with the $/ means ends with ##### \s is whitespace the * is (0 or more) + is (1 or more) ##### } #We're done with the file, so close it # close IN; # Count the number of A's, T's, G's, and C's in $sequence using s///g in # a slightly funky way. Using s///g is a fast way to count characters # in a string. s stands for substitute and hence substitutes the string # in the left slashes for the one in the right slashes, the g indicates # to preform this globally (without g it will only do it once). And it's a # helluva lot easier to code than tearing the whole string into an array # and counting that way (unless you need the sequence in array form # for another reason ...) # # For more exciting information on the powerful s/// operator, see # "Learning Perl," pp 13 and 88. # my $a = $sequence =~ s/A/A/g; my $t = $sequence =~ s/T/T/g; my $g = $sequence =~ s/G/G/g; my $c = $sequence =~ s/C/C/g; my $seq_length = length($sequence); #calculate nucleotide frequencies while rounding the values: #Rounding the values this way might effect the cttg_exp calculation #One alternative is the substr function my $a_freq = sprintf("%.3f", $a/$seq_length); my $t_freq = sprintf("%.3f", $t/$seq_length); my $g_freq = sprintf("%.3f", $g/$seq_length); my $c_freq = sprintf("%.3f", $c/$seq_length); # Count the actual number of CTTG occurances my $cttg = $sequence =~ s/CTTG/CTTG/g; #Calculate expected CTTG occurances #The int function is being used to round the value down to the nearest whole integer #It would not make sense to expect a piece of a CTTG my $cttg_exp = int($c_freq*$t_freq*$t_freq*$g_freq*($seq_length-3)); #print the result # print "Actual CTTG occurances: $cttg\n\n"; print "Nucleotide frequencies:\n"; print "A: $a_freq\nT: $t_freq\nG: $g_freq\nC: $c_freq\n\n"; print "Expected CTTG occurances: $cttg_exp\n"; ##### ##### IN THIS CASE, YOU WILL LIKELY RUN OUT OF MEMORY WITH THIS METHOD !!!!! ##### USE THESE TO LEARN SYNTAX !!!! ##### ##### an alternate method of doing the same thing .... ##### We will split up the sequence into an array of characters and then ##### count those characters. ##### ##### my $a=0; my $c=0; my $g=0; my $t=0; my $n=0; ##### my @seq_array = split //, $sequence; ##### foreach my $char (@seq_array) { ##### $a++ if ($char eq "A"); ##### $c++ if ($char eq "C"); ##### $g++ if ($char eq "G"); ##### $t++ if ($char eq "T"); ##### $n++ if ($char eq "N"); ##### } ##### print "A: $a\nT: $t\nG: $g\nC: $c\nN: $n\n"; ##### ##### ##### Another alternative method (even more complicated), using hashes ##### hashes are ways of making key-value pairs ##### ##### my %seq_counts = (); ##### my @seq_array = split //, $sequence; ##### foreach my $char (@seq_array) { ##### $seq_counts{$char}++; ##### } ##### foreach my $key (sort keys %seq_counts) { ##### print "$key: ", $seq_counts{$key}, "\n"; ##### } #####