diff --git a/scripts/Analysis/find_supporting_evidence b/scripts/Analysis/find_supporting_evidence new file mode 100755 index 0000000000000000000000000000000000000000..1ac3f9e813218b28097aab28a0a20573a982df7b --- /dev/null +++ b/scripts/Analysis/find_supporting_evidence @@ -0,0 +1,140 @@ +#!/usr/local/bin/perl + +BEGIN { + unshift (@INC,"/nfs/disk89/michele/bioperl-live"); + unshift (@INC,"/nfs/disk89/michele/pogdir/ensembl/modules"); +} + +# Let the code begin... + + +use Bio::EnsEMBL::DBSQL::Obj; +use Bio::EnsEMBL::TimDB::Obj; +use Bio::EnsEMBL::DB::ContigI; + +use Getopt::Long; +use strict; + +my ($dbname, + $dbhost, + $dbuser, + $dbpass, + $clonefile, + ); + +$|=1; + +GetOptions("dbname=s" => \$dbname, + "dbhost=s" => \$dbhost, + "dbuser=s" => \$dbuser, + "dbpass=s" => \$dbpass, + "clonefile=s" => \$clonefile, + ) or usage(); + +$dbname = 'ensembl' unless $dbname; +$dbhost = 'obi-wan' unless $dbhost; +$dbuser = 'ens-ro' unless $dbuser; +$dbpass = '' unless $dbpass; +# Connect to the database + +my $db = Bio::EnsEMBL::DBSQL::Obj->new(-dbname => $dbname, + -host => $dbhost, + -user => $dbuser, + -pass => $dbpass, + ) or die "Can't connect to database"; + +my $db2 = Bio::EnsEMBL::DBSQL::Obj->new(-dbname => 'ensembl', + -host => $dbhost, + -user => $dbuser, + -pass => $dbpass, + ) or die "Can't connect to database"; + +my @clones = get_clones($clonefile); + + +foreach my $cloneid (@clones) { + eval { + print(STDERR " - Processing clone $cloneid\n"); + my $clone = $db->get_Clone($cloneid); + print(STDERR " - getting genes for clone $cloneid\n"); + + my @features; + + print(STDERR " - getting contigs\n"); + + foreach my $contig ($clone->get_all_Contigs) { + push(@features,$contig->get_all_SimilarityFeatures); + } + + print(STDERR " - getting genes\n"); + my @genes = $clone->get_all_Genes; + print(STDERR " - done\n"); + + foreach my $gene (@genes) { + + print(STDERR " - processing gene " . $gene->id . "\n"); + + foreach my $exon ($gene->each_unique_Exon) { + + my @homols; + + print(STDERR " - processing exon " . $exon->id . "\n"); + + $exon ->find_supporting_evidence (\@features); + print(STDERR " - found supporting evidence\n"); + $db2 ->write_supporting_evidence($exon); + } + } + }; + if ($@) { + print(STDERR "Error processing clone $cloneid. Skipping clone"); + } +} + +############################################################ +# +# Subroutines. +# +# This are inserted mostly just for tidiness +# +############################################################ + +sub usage { + print("\nUsage: featureparser_test2 <clone> <contig_number> <accession>\n"); + exit(0); +} + +sub get_clones { + my ($clonefile) = @_; + + open(FILE,"<$clonefile") || die "Can't open clone file $clonefile"; + + my @clones; + while (<FILE>) { + chomp; + push(@clones,$_); + } + + return @clones; +} + +sub homol2gff { + my ($fh,$homols,$exon) = @_; + + foreach my $h1 (@$homols) { + my $h2 = $h1->feature2; + my $strand = "+"; + + if ($h2->strand == -1) { + $strand = "-"; + } + + print($fh $exon . "\t" . $h1->source_tag . "\t" . $h2->primary_tag ."\t" . $h1->start . "\t" . $h1->end . "\t" . $h1->score ."\t" . $strand . + "\t0\tSequence:" . + $h2->seqname . "\t" . $h2->start . "\t" . $h2->end ."\n"); + } +} + + + + diff --git a/scripts/Analysis/supporting_evidence_test b/scripts/Analysis/supporting_evidence_test new file mode 100755 index 0000000000000000000000000000000000000000..9839c9c4af1739a9038c831bfac156b667ac288b --- /dev/null +++ b/scripts/Analysis/supporting_evidence_test @@ -0,0 +1,114 @@ +#!/usr/local/bin/perl + +BEGIN { + unshift (@INC,"/nfs/disk89/michele/bioperl-live"); + unshift (@INC,"/nfs/disk89/michele/pogdir/ensembl/modules"); +} + +# Let the code begin... + + +use Bio::EnsEMBL::DBSQL::Obj; +use Bio::EnsEMBL::TimDB::Obj; +use Bio::EnsEMBL::DB::ContigI; + +use Getopt::Long; +use strict; + +my ($cloneid, +); + +$|=1; + +GetOptions("clone=s" => \$cloneid, # Clone name e.g. dJ998N21 + ) or usage(); + +# Connect to the database + +my $db = Bio::EnsEMBL::DBSQL::Obj->new(-dbname => 'pogtest', + -host => 'obi-wan', + -user => 'ensloader', + ) or die "Can't connect to database"; +my @clones; +push(@clones,$cloneid); + +my $clone = $db->get_Clone($cloneid); +print(" - getting genes for clone $cloneid\n"); + +open(POG,">" . $clone->id . ".gff"); + + +my @homols; +my @features; + +print(" - getting contigs\n"); + +foreach my $contig ($clone->get_all_Contigs) { + push(@features,$contig->get_all_SimilarityFeatures); +} + +print(" - getting genes\n"); +my @genes = $clone->get_all_Genes; +print(" - done\n"); + +foreach my $gene (@genes) { + + print(" - processing gene " . $gene->id . "\n"); + + foreach my $exon ($gene->each_unique_Exon) { + my @homols; + print(" - processing exon " . $exon->id . "\n"); + $exon ->find_supporting_evidence (\@features); + print(" - found supporting evidence\n"); + $db ->write_supporting_evidence($exon); + + $exon->{_supporting_evidence} = []; + print("supporting features are [" . $exon->each_Supporting_Feature . "]\n"); + print(" - written supporting evidence\n"); + $db ->get_supporting_evidence ($exon); + print(" - got supporting evidence\n"); + push(@homols,$exon->each_Supporting_Feature); + homol2gff(\*POG,\@homols,$exon->id); # Prints out gff. + } + } + + + + +close(POG); + + +############################################################ +# +# Subroutines. +# +# This are inserted mostly just for tidiness +# +############################################################ + +sub usage { + print("\nUsage: featureparser_test2 <clone> <contig_number> <accession>\n"); + exit(0); +} + + +sub homol2gff { + my ($fh,$homols,$exon) = @_; + + foreach my $h1 (@$homols) { + my $h2 = $h1->feature2; + my $strand = "+"; + + if ($h2->strand == -1) { + $strand = "-"; + } + + print($fh $exon . "\t" . $h1->source_tag . "\t" . $h2->primary_tag ."\t" . $h1->start . "\t" . $h1->end . "\t" . $h1->score ."\t" . $strand . + "\t0\tSequence:" . + $h2->seqname . "\t" . $h2->start . "\t" . $h2->end ."\n"); + } +} + + + + diff --git a/scripts/Analysis/testHMM b/scripts/Analysis/testHMM new file mode 100755 index 0000000000000000000000000000000000000000..a85e058c7293dac0eba55f9adb57f7f6a8798b0e --- /dev/null +++ b/scripts/Analysis/testHMM @@ -0,0 +1,100 @@ +#!/usr/local/bin/perl + +BEGIN { + unshift (@INC,"/nfs/disk89/michele/bioperl-live"); + unshift (@INC,"/nfs/disk89/michele/pogdir/ensembl/modules"); +} + +# Let the code begin... + + +use Bio::EnsEMBL::Analysis::Genscan; +use Bio::EnsEMBL::Analysis::GenscanPeptide; +use Bio::Tools::HMMER::Results; + +use Getopt::Long; +use strict; + +my ($clone, + $contig, +); + + +GetOptions("clone=s" => \$clone, # Clone name e.g. dJ998N21 + "contig=s" => \$contig, # Contig number e.g. 00577 + ) or usage(); + +# Initialize directories and clone/contig names + +my $root_dir = "/nfs/disk100/humpub/th/unfinished_ana/data/$clone/"; + +my $gsfile = $root_dir . "/" . $clone . "." . $contig . ".gs"; +my $dnafile = $root_dir . "/" . $clone . ".seq"; + + +open(POG,">$clone.$contig.gff"); # This is for gff output + +my $fh = \*POG; +my $dna = find_seq($dnafile,$clone,$contig) or die "Couldn't find dna for $clone.$contig"; +my $gs = new Bio::EnsEMBL::Analysis::Genscan($gsfile,$dna); + +my $count = 0; + +TRANS: foreach my $g ($gs->each_Transcript) { + + $count++; + + print_gene($fh,$g); # Just for debugging purposes + + my $genpep = new Bio::EnsEMBL::Analysis::GenscanPeptide($g); + + my $pfamfile = $root_dir . "/" . $clone . "." . $contig . ".$count.hmmpfam_frag"; + print("Pfamfile is $pfamfile\n"); + + my $res = new Bio::Tools::HMMER::Results( -file => "$pfamfile", + -type => 'hmmpfam'); + + print("Made object\n"); + # print out the results for each sequence + foreach my $seq ( $res->each_Set ) { + print("here\n"); + print "Sequence bit score is",$seq->bits,"\n"; + foreach my $domain ( $seq->each_Domain ) { + print " Domain start ",$domain->start," end ",$domain->end," score ",$domain->bits,"\n"; + } + } + + # new result object on a sequence/domain cutoff of 25 bits sequence, 15 bits domain +# my $newresult = $res->filter_on_cutoff(25,15); + + # alternative way of getting out all domains directly + foreach my $domain ( $res->each_Domain ) { + print "Domain on ",$domain->seqname," with score ",$domain->bits," evalue ",$domain->evalue,"\n"; + } +} +sub find_seq { + my ($dnafile,$clone,$contig) = @_; + + my $seqio = new Bio::SeqIO(-file => $dnafile, + -format => 'Fasta'); + + my $dna; + + while (my $d = $seqio->next_seq) { + if ($d->id eq "$clone.$contig") { + $dna = $d; + } + } + return $dna; +} +sub print_gene { + my ($fh,$g) = @_; + + foreach my $ex ($g->each_Exon) { + print($fh "Gene\tGENSCAN\texon\t" . $ex->start . "\t" . $ex->end . "\t0\t" . $ex->strand . "\t0\n"); + } + + print("\nTranslation " . $g->translate()->seq . "\n"); +} + +