#! /usr/bin/perl
use strict;
use warnings;

#Scott Doniger
#January 19, 2005
#WARNING - this file is a lot more complicated than the previous ones. Make sure you're
#comfortable with the others before looking at this. Read the section on Hashes first.

#ReadFastaFile.pl
#This is a simple program to read in a fasta file and parse each sequence with its
#name. You will hopefully see how file input and output works, how to parse pieces
#of files that are not separated by newlines, and how to use hashes.
#See fastafile1.fa to see what the fasta format is.

#To use the program enter
#    perl ./ReadFastaFile.pl filename
#where filename is a fasta file.

use constant SEQLENGTH => 10; #We will print out the first SEQLENGTH characters of each sequence.

my %sequencehash; #this is a hash that will store sequences and their names.

if (scalar(@ARGV) != 1) {
    #the phrase "scalar(@arrayname)" says "treat the array as a scalar", which in this case means
    #the number of elements in the array ARGV.
    die "You did not give me a file to work with. Please include the filename\n";
}

my $fastafile = shift @ARGV; #@ARGV is the array of the input parameters. shift grabs the first parameter.

#We now create a filehandle, stored in $fasta_fh, which allows us to access the lines of the file.
#The '<' says "open for reading". Keeping the mode separate from the filename (the three-argument
#form of open) means odd characters in the filename can never be mistaken for a mode.
#open returns false when it fails, so "or die" stops the program with the reason ($!) instead of
#silently carrying on with nothing to read.
open(my $fasta_fh, '<', $fastafile)
    or die "Could not open '$fastafile' for reading: $!\n";

#Now we want to read through the file and create a hash. The key of the hash will be the sequence
#name, the value stored will be the (first SEQLENGTH characters of the) sequence.
my $name;          #stays undefined until we have seen the first ">" line
my $sequence = ""; #the sequence of the record we are currently reading

#We know that a fasta file goes name, sequence, name, sequence, name, sequence, ...
#where a sequence may be spread over several lines.
while (my $line = <$fasta_fh>) {
    #remove the line ending (also copes with Windows style "\r\n" endings), so that
    #neither the names nor the sequences contain stray newline characters.
    $line =~ s/[\r\n]+$//;

    if ($line =~ /^>/) {
        #a line STARTING with ">" is the start of a new fasta record, so we need to store
        #the previous record (if there was one) before we start on the new one.
        if (defined $name) {
            #substr gets the first SEQLENGTH characters of the sequence (counting from 0).
            #If the sequence is shorter than that we simply get all of it.
            $sequencehash{$name} = substr($sequence, 0, SEQLENGTH);
            #so here we have a key in the hash of $name. The value that this key points to is
            #the start of the sequence. So for every fasta entry we have now created a pair of
            #data, the name, and its corresponding sequence.
            #Notice that it is $sequencehash here, but before it was %sequencehash. This is a
            #syntax thing. You can simply memorize that when you want the entire hash you use
            #%sequencehash, but when you want a specific element of the hash it is
            #$sequencehash{$mykey}
        }

        #ok we stored the previous sequence, let's start over again.
        $name = $line;     #reset the name
        $name =~ s/^>//;   #substitute the leading > with nothing
        $sequence = "";    #this is now reset to an empty string.
    }
    elsif (defined $name) {
        #we know that this isn't a new record since there is no > at the start, so we append
        #this line to the sequence variable. ".=" is the same as saying x = x.newstuff
        #(Lines that appear before the very first ">" belong to no record and are skipped.)
        $sequence .= $line;
    }
}

#The loop above only stores a record when it meets the NEXT ">" line, so when we run out of
#lines the last record of the file is still waiting to be stored. Do that now.
if (defined $name) {
    $sequencehash{$name} = substr($sequence, 0, SEQLENGTH);
}

close $fasta_fh;

#ok so we read the file. How do we get into the hash and get back our data?
foreach my $k (sort keys %sequencehash) {
    #foreach accesses every element of a list. "keys %hash" returns a list that contains all of
    #the keys of the hash. sort is there to order the keys alphabetically. The tricky thing about
    #hashes is that the keys are not necessarily stored in the order they were entered. The hash
    #finds its own way to order the keys, allowing it to efficiently access its elements.
    print "$k => ".$sequencehash{$k}."\n";
}