#!/usr/bin/perl -w
#
# nuc_count.pl - print per-contig nucleotide frequencies for a FASTA file.
#
# For every contig the script prints the contig's header line followed by
# the fraction of A, T, G and C among that contig's A/C/G/T bases.  Any
# other character (N included) is left out of the total.
#
# Usage: perl nuc_count.pl <fasta_file>

use strict;
use warnings;

# Check to make sure that one file was given on the command line; if not,
# print a correct usage statement.
if (@ARGV != 1) {
    die "\nUsage: perl nuc_count.pl <fasta_file>\n\n";
}

# Get the file name and open an input stream to it.  The three-argument
# form of open treats the name purely as a path, so characters such as
# '>' or '|' in the argument cannot change the open mode.
my $file = shift;
open my $in, '<', $file
    or die "Couldn't open file: $file ($!)\n";

# $sequence accumulates the bases of the contig currently being read;
# $contig remembers that contig's header for diagnostics.
my $sequence = '';
my $contig   = '(sequence before first header)';

# Read in the contents of the file one line at a time.
while (my $line = <$in>) {

    if ($line =~ /^\w/) {
        # A line that begins with a word character is sequence: chomp off
        # the newline, uppercase it for easier counting and append it to
        # the growing sequence.  Blank lines match neither branch and are
        # skipped.
        chomp $line;
        $sequence .= uc $line;
    }
    elsif ($line =~ /^>/) {
        # A line that starts with ">" begins a new contig.  If we have
        # collected sequence for the previous contig, report it first.
        if ($sequence =~ /\w/) {
            report_frequencies($sequence, "\n", $contig);
        }

        # Print the name of the new contig after the old frequencies.
        # $line still carries its own newline, so this also emits the
        # blank separator line that follows each header.
        print "$line\n";

        # Reset state for the new contig.
        ($contig = $line) =~ s/\s+\z//;
        $sequence = '';
    }
}

# We're done with the file, so close it.
close $in;

# $sequence still holds the last contig's bases; report them too.
report_frequencies($sequence, '', $contig);

# base_frequencies($sequence)
#
# Count the A's, T's, G's and C's in $sequence and return a list of
# (base => frequency) pairs, where each frequency is the base's count
# divided by the combined A+C+G+T count.  Returns an empty list when the
# sequence contains none of those four bases, so that callers never divide
# by zero.
#
# tr/X// with an empty replacement list and no /d modifier returns the
# number of X characters without modifying the string, which makes it a
# fast way to count characters.
sub base_frequencies {
    my ($bases) = @_;

    my %count_of = (
        A => ($bases =~ tr/A//),
        T => ($bases =~ tr/T//),
        G => ($bases =~ tr/G//),
        C => ($bases =~ tr/C//),
    );

    # Count up the total bases.
    my $total = $count_of{A} + $count_of{C} + $count_of{G} + $count_of{T};
    return if $total == 0;

    # Get the frequencies by dividing by the total number of bases.
    return map { $_ => $count_of{$_} / $total } qw(A T G C);
}

# report_frequencies($sequence, $trailer, $label)
#
# Print the A/T/G/C frequencies of $sequence to STDOUT, followed by
# $trailer (an extra "\n" between contigs, '' after the last one).  When
# the sequence has no A/C/G/T bases nothing is printed and a warning that
# names $label is written to STDERR instead.
sub report_frequencies {
    my ($bases, $trailer, $label) = @_;

    my %freq_of = base_frequencies($bases);
    if (!%freq_of) {
        warn "No A, C, G or T bases found for $label; "
           . "skipping frequency report\n";
        return;
    }

    print "A: $freq_of{A}\nT: $freq_of{T}\nG: $freq_of{G}\nC: $freq_of{C}\n$trailer";
    return;
}