#!/usr/bin/perl $usage .= "$0 - look up GFF intersect tags\n"; $usage .= "\n"; $usage .= "Usage: $0 [-self] [-image] [-unique] [-evalfile expr] [-quiet] [-showtogether] [-maxscore|-minscore] [-first|-maxlen|-minpairdistance|-maxsort expr] []\n"; $usage .= "\n"; $usage .= "group field in should have intersect tags of the form \"intersect(filename)=(1 5 6 19)\"\n"; $usage .= " ... this will print lines 1, 5, 6 and 19 of \n"; $usage .= "\n"; $usage .= "Use the -evalfile switch to perform operations (eg substitutions) on before printing;\n"; $usage .= "one of these expressions must evaluate to 1 or the intersect tag will be discarded,\n"; $usage .= "so -evalfile can also be used for tests.\n"; $usage .= "\n"; $usage .= "Use the -self switch in conjunction with \"gffintersect.pl -self\" (equivalent to \"-quiet -evalfile s/^self&//\")\n"; $usage .= "\n"; $usage .= "Use the -image switch to find the \"image\" of 's GFFs, assuming this is a GFF-pair\n"; $usage .= "\n"; $usage .= "Use the -quiet switch to suppress the addition of intersect()=() information to the output\n"; $usage .= "\n"; $usage .= "Use the -showtogether switch to show the input GFF lines as well (some lines may appear more than once)\n"; $usage .= "\n"; $usage .= "Use the -maxscore switch to only print the highest-scoring hit for a line,\n"; $usage .= "Use -minscore to only print the lowest-scoring hit for a line,\n"; $usage .= "Use -first to only print the max-starting-position hit for a line,\n"; $usage .= "Use -maxlen to only print the longest hit for a line,\n"; $usage .= "Use -minpairdistance to only print the closest-pair hit for a line,\n"; $usage .= "Use the more generic -maxsort to specify a sorting criterion, with \@a and \@b holding the GFF fields\n"; $usage .= " e.g. -maxscore is equivalent to \"-maxsort '\$b[5]<=>\$a[5]'\"\n"; $usage .= "Use -unique to entirely reject intersect sets larger than one\n"; $usage .= "\n"; $usage .= "EXAMPLE: How to prune a GFF file of the lowest-scoring redundant entries:\n"; $usage .= " gffintersect.pl -self | intersectlookup.pl -self -maxscore\n"; $usage .= "\n"; while (@ARGV) { last unless $ARGV[0] =~ /^-/; $opt = lc shift; if ($opt eq "-evalfile") { defined($evalfile = shift) or die $usage; push @evalfile, $evalfile } elsif ($opt eq "-self") { push @evalfile, "s/^self&//"; $quiet = 1; $self = 1 } elsif ($opt eq "-image") { $image = 1 } elsif ($opt eq "-unique") { $unique = 1 } elsif ($opt eq "-first") { $sort = "bystartpos" } elsif ($opt eq "-maxscore") { $sort = "bymaxscore" } elsif ($opt eq "-minscore") { $sort = "byminscore" } elsif ($opt eq "-maxlen") { $sort = "bylen" } elsif ($opt eq "-mergeset") { $mergeSet = 1 } elsif ($opt eq "-minpairdistance") { $sort = "bypairdistance" } elsif ($opt eq "-maxsort") { defined($sortexpr = shift) or die $usage; $sort = "byexpr" } elsif ($opt eq "-quiet") { $quiet = 1 } elsif ($opt eq "-showtogether") { $showtogether = 1 } elsif ($opt eq "-h") {die "$usage\n"} else { die "$usage\nUnknown option: $opt\n" } } @ARGV>=1 or @ARGV = ("-"); foreach $file1 (@ARGV) { open file1, $file1 or die "$0: couldn't open $file1: $!"; $line1 = 0; while () { ++$line1; $lineText{$line1} = $_; @f = split /\t/; my @pair = split(/\s+/,$f[8]); if ($f[6] eq "-") { @pair[1,2] = @pair[2,1] } $set = undef; while ((($file2,$lines2) = ($f[8] =~ /intersect$([^$]+)\)=$([^$]+)\)/)) && $file2 ne "") { $f8sub = quotemeta "intersect($file2)=($lines2)"; # Backslash all non-alphanumeric characters $f[8] =~ s/$f8sub//; if (@evalfile) { $_ = $file2; $print = 0; foreach $evalfile (@evalfile) { #warn "Evaluating $evalfile on $_\n"; if (eval $evalfile) { $print = 1 } } next unless $print; $file2 = $_; } next unless -r $file2; #$tmpf8 = $f[8]; chomp $tmpf8; warn "file1=$file1 line1=$line1 file2=$file2 lines2=($lines2) \$f[8]='$tmpf8'\n"; @line2 = split /\s+/, $lines2; foreach $line2 (@line2) { # If the unique flag is specified and intersect set is larger than 1, set the tainted flag. if ($unique && @line2 > 1) { $tainted{$file2}->{$line2} = 1 } else { # Keep track of the relation from each member of the intersect set to line1 push @{$backlookup{$file2}->{$line2}->{$file1}}, $line1; # If flag is set, show input GFF lines too if ($showtogether) { $keep{$file1}->{$line1} = $_ } if ($image) { $transform{$file1}->{$line1} = [@f[0,3,4],@pair[0..2]] } } } if (defined $sort || $mergeSet) { # Initially, %set, $set, $sets,and $thisset are all undefined. # $thisset = contains the intersection set number for line2 # %set = hash that relates each line2 to an intersection set # $sets = stores the number of intersection sets found foreach $line2 (@line2) { $thisset = $set{$file2}->{$line2}; $file2line2 = "$file2\t$line2"; # Note that the $set scalar will always be undefined # on the initial pass thru this loop. if (defined $set) { if (defined $thisset) { if ($thisset != $set) { foreach (@{$members[$thisset]}) { ($thisfile2,$thisline2) = split /\t/; $set{$thisfile2}->{$thisline2} = $set; } undef $members[$thisset]; } } else { $set{$file2}->{$line2} = $set; undef $gfftext[$set]->{$file2line2}; } } else { if (defined $thisset) { $set = $thisset; } else { $set = $sets++; $set{$file2}->{$line2} = $set; $gfftext[$set]->{$file2line2} = undef; # mark a place in this hash } } } } } # while ($file2, $line2) } # while () close file1; } # Merge intersect set lines into one line per set. if ($mergeSet && $self) { @file2list = keys %set; #print "file2list=[@file2list]\n"; $file2 = $file2list[0]; @line2list = keys %{$set{$file2}}; #print "line2list=[@line2list]\n"; @set2list = values %{$set{$file2}}; #print "set2list=[@set2list]\n"; $pSetNum = undef; foreach $setNum (sort bynum @set2list) { if (defined($pSetNum) && $setNum == $pSetNum) { next;} $minStart = undef; $maxEnd = undef; foreach $lineNum (@line2list) { if ($set{$file2}->{$lineNum} == $setNum) { $line = $lineText{$lineNum}; @fields = split /\t/, $line; $start = $fields[3]; $end = $fields[4]; $name = $fields[0]; $strand = $fields[6]; if (defined($minStart)) { if ($start < $minStart) {$minStart = $start;} } else {$minStart = $start;} if (defined($maxEnd)) { if ($end > $maxEnd) {$maxEnd = $end;} } else {$maxEnd = $end;} } } print "$name\t.\t.\t$minStart\t$maxEnd\t.\t$strand\t.\t$setNum\n"; $pSetNum = $setNum; } exit; } while (($file2,$line2ref) = each %backlookup) { unless (open file2, $file2) { warn "$0: couldn't open $file2: $!\n"; next; } @line2 = sort {$a<=>$b} keys %$line2ref; # sort in ascending order $line2 = 0; while (@line2 && defined($_ = )) { if (++$line2 == $line2[0] && !$tainted{$file2}->{shift @line2}) { if (defined $sort) { $set = $set{$file2}->{$line2}; @f = split /\t/; $gfftext[$set]->{"$file2\t$line2"} = $_; if (++$count[$set] == keys %{$gfftext[$set]}) { %member = (); while (($member,$gfftext) = each %{$gfftext[$set]}) { $member{$gfftext} = $member } #warn "Set $set:\n"; #while (($member,$gfftext) = each %{$gfftext[$set]}) { warn " member=\"$member\" gfftext=$gfftext" } $maxgfftext = (sort $sort keys %member)[0]; #warn " maxmember=\"$member{$maxgfftext}\"\n"; ($maxfile2,$maxline2) = split /\t/, $member{$maxgfftext}; show($maxfile2,$maxline2,$maxgfftext); undef $gfftext[$set]; } } else { show($file2,$line2,$_); } } } close file2; } sub show { my ($file2,$line2,$gfftext) = @_; my $fileref = $backlookup{$file2}->{$line2}; if ($showtogether) { while (($file1,$lineref1) = each %$fileref) { foreach $line1 (@$lineref1) { print $keep{$file1}->{$line1} } } } my @f = split /\t/, $gfftext; chomp $f[8]; if ($image) { while (($file1,$lineref1) = each %$fileref) { my $line1; foreach $line1 (@$lineref1) { my @transform = @{$transform{$file1}->{$line1}}; my $dir = $transform[4] > $transform[5] ? -1 : +1; # print "file1=$file1 line1=$line1 transform=(@transform) f=(@f)\n"; my $start = ($f[3] - $transform[1]) * $dir + $transform[4]; my $end = ($f[4] - $transform[1]) * $dir + $transform[4]; my $strand = $f[6]; if ($start > $end) { ($start,$end) = ($end,$start); if ($strand eq "+") { $strand = "-" } elsif ($strand eq "-") { $strand = "+" } } print join("\t",$transform[3],$f[1],"$f[2]-homology",$start,$end,$f[5],$strand,@f[7,8]); unless ($quiet) { print " original($file2)=$line2" } print "\n"; } } } elsif ($quiet) { print $gfftext; } else { while (($file1,$lineref1) = each %$fileref) { $f[8] .= ' ' unless $f[8] =~ / $/; $f[8] .= "intersect($file1)=(@$lineref1)"; } print join("\t",@f)."\n"; } if ($showtogether) { print "\n" } } sub bystartpos { my @a = split /\t/, $a; my @b = split /\t/, $b; $b[3] <=> $a[3]; } sub bymaxscore { my @a = split /\t/, $a; my @b = split /\t/, $b; $b[5] <=> $a[5]; # descending order } sub byminscore { my @a = split /\t/, $a; my @b = split /\t/, $b; $a[5] <=> $b[5]; # ascending order } sub bylen { my @a = split /\t/, $a; my @b = split /\t/, $b; $b[4]-$b[3] <=> $a[4]-$a[3]; } sub bypairdistance { my @a = split /\t/, $a; my @b = split /\t/, $b; my ($apair) = ($a[8] =~ /(\d+)/); my ($bpair) = ($b[8] =~ /(\d+)/); abs($apair-$a[3]) <=> abs($bpair-$b[3]); } sub byexpr { my @a = split /\t/, $a; my @b = split /\t/, $b; eval $sortexpr; } sub bynum { $a <=> $b }