#!/usr/bin/perl ############################################################################# # # General program for calculating LD given a list of positions and # loci types # For part 2 of assignment 6, Genomics # # perl cal_LD.pl snp_list seq_file OUTFILE # looks like: # 14 a t # 19 a g # 25 t c # 50 t g # 53 g c # 60 a t # 80 t c # contains every sequence on 1 line # # # a file containing LD values with chi-square values # # ############################################################################## if($#ARGV != 2) {print "perl cal_LD.pl snp_list seq_file OUTFILE\n";exit(0);} #take position as is. ie, start from 0 $n=0; open(IN,"$ARGV[0]") || die("cannot open $ARGV[0]\n"); while($line=) { chomp($line); if($line =~ m/^[1234567890]/) { @temp=split("\t",$line); push @{$snp_info[$n]}, @temp; $n++; } } close(IN); #check snp_info #for ($i=0;$i<$#snp_info+1;$i++) { # for($j=0;$j<$#{$snp_info[0]}+1;$j++) # { # print $snp_info[$i][$j]."\t"; # } # print "\n"; #} #take the sequence file and extract the given position open(IN,"$ARGV[1]") || die ("cannot open $ARGV[1]\n"); while($line=) { chomp($line); push @data, $line; } close(IN); for($i=0;$i<$#data+1;$i++) { for($j=0;$j<$#snp_info+1;$j++) { push @{$snp[$i]}, substr($data[$i],$snp_info[$j][0],1); } } ########################################## #test print seq #open(OUT, ">>$ARGV[2]")||die("cannot open $ARGV[2]\n"); #for ($i=0;$i<$#snp+1 ;$i++) { # for($j=0;$j<$#{$snp[0]}+1;$j++) # { # print $snp[$i][$j]; # } #print "\n"; #} ########################################### $n11=0;$n12=0;$n21=0;$n22=0; for($i=0;$i<$#snp_info+1;$i++) { for($j=$i+1;$j<$#snp_info+1; $j++) { #calc observed freq for($k=0;$k<$#snp+1;$k++) { if($snp[$k][$i] eq $snp_info[$i][1] && $snp[$k][$j] eq $snp_info[$j][1]) { $n11++; } elsif($snp[$k][$i] eq $snp_info[$i][1] && $snp[$k][$j] eq $snp_info[$j][2]) { $n12++; } elsif($snp[$k][$i] eq $snp_info[$i][2] && $snp[$k][$j] eq $snp_info[$j][1]) { $n21++; } else { $n22++; } } # print "snp1=$snp_info[$i][0]\tsnp2=$snp_info[$j][0]\n"; # print "$snp_info[$i][1]\t$snp_info[$j][1]\t=$n11\n"; # print "$snp_info[$i][1]\t$snp_info[$j][2]\t=$n12\n"; # print "$snp_info[$i][2]\t$snp_info[$j][1]\t=$n21\n"; # print "$snp_info[$i][2]\t$snp_info[$j][2]\t=$n22\n\n"; $tot=$#data+1; $LD_matrix[$i][$j]=($n11/$tot)-($n11+$n21)*($n11+$n12)/($tot*$tot); $row1_tot=$n11+$n12; $row2_tot=$n21+$n22; $col1_tot=$n11+$n21; $col2_tot=$n12+$n22; $exp11=$row1_tot*$col1_tot/$tot; $exp12=$row1_tot*$col2_tot/$tot; $exp21=$row2_tot*$col1_tot/$tot; $exp22=$row2_tot*$col2_tot/$tot; $p_matrix[$i][$j]=($n11-$exp11)**2/$exp11 + ($n12-$exp12)**2/$exp12 + ($n21-$exp21)**2 / $exp21 + ($n22-$exp22)**2 / $exp22; $n11=0;$n12=0;$n21=0;$n22=0; } } open(OUT, ">>$ARGV[2]")||die("cannot open $ARGV[2]\n"); ## #print LD_matrix ## print OUT "\t"; for ($i=0;$i<$#snp_info+1;$i++) { print OUT $snp_info[$i][0]."\t"; } print OUT "\n"; for($i=0;$i<$#snp_info+1;$i++) {print OUT $snp_info[$i][0]."\t"; for($j=0;$j<$#snp_info+1;$j++) { print OUT $LD_matrix[$i][$j]."\t"; } print OUT "\n"; } ## #print p_matrix ## print OUT "\n\n\t"; for ($i=0;$i<$#snp_info+1;$i++) { print OUT $snp_info[$i][0]."\t"; } print OUT "\n"; for($i=0;$i<$#snp_info+1;$i++) { print OUT $snp_info[$i][0]."\t"; for($j=0;$j<$#snp_info+1;$j++) { print OUT sprintf "%.2f\t", $p_matrix[$i][$j]; } print OUT "\n"; } close(OUT);