diff --git a/misc-scripts/chimp/InterimExon.pm b/misc-scripts/chimp/InterimExon.pm index 10671144ea345264c76dbb5884f064c1642a3f59..b851cd96728ec92fc0c79fe3c03464b18424523c 100644 --- a/misc-scripts/chimp/InterimExon.pm +++ b/misc-scripts/chimp/InterimExon.pm @@ -37,11 +37,11 @@ sub is_fatal { foreach my $msg (@{$self->get_all_StatMsgs}) { foreach my $code (@FATAL) { if(($msg->code() & $code) == $code) { - info("Code is Fatal: ". StatMsg::code2str($msg->code())); + #info("Code is Fatal: ". StatMsg::code2str($msg->code())); return 1; } } - info("Code is NON fatal=". StatMsg::code2str($msg->code())); + #info("Code is NON fatal=". StatMsg::code2str($msg->code())); } return 0; @@ -146,7 +146,7 @@ sub fail { if(@_) { my $fail = shift; - warning("Setting ".$self->stable_id." to failed.\n") if($fail); + #warning("Setting ".$self->stable_id." to failed.\n") if($fail); $self->{'fail'} = $fail; } diff --git a/misc-scripts/chimp/StatMsg.pm b/misc-scripts/chimp/StatMsg.pm index 4de4fe045fe27d6d1681e1e655667e0c7cedd7dd..6018ad9c45678f17e14373442db00221e7a64716 100644 --- a/misc-scripts/chimp/StatMsg.pm +++ b/misc-scripts/chimp/StatMsg.pm @@ -42,6 +42,10 @@ use constant MIDDLE => 0x00040000; use constant CONFUSED => 0x00080000; use constant ALL_INTRON => 0x00100000; +use constant TRANSLATES => 0x00200000; +use constant SPLIT => 0x00400000; +use constant NO_SEQUENCE_LEFT => 0x00800000; +use constant PARTIAL => 0x01000000; use vars qw(@EXPORT_OK @ISA); @@ -50,6 +54,10 @@ use vars qw(@EXPORT_OK @ISA); @EXPORT_OK = qw(&push_err &pop_err &ec2str); +our $CUR_ID = 0; +our $LOGGER = undef; + + sub new { my $class = shift; my $code = shift; @@ -58,20 +66,53 @@ sub new { die("Status Code Argument is required.\n"); } - return bless {'code' => $code}, $class; + my $sm = bless {'code' => $code, 'id' => $CUR_ID++}, $class; + + if($LOGGER) { + $LOGGER->add_StatMsg($sm); + } + + return $sm; +} + + +# +# Sets a logger for stat msg logging +# If set, every created StatMsg will be logged to a file +# if undef, nothing will be logged. +# +sub set_logger { + my $logger = shift; + $LOGGER = $logger; +} + + +# +# Unique id is given to each statmsg so that they can be distinguished. +# Sometimes the same statmsg will be given to multiple exons and it is handy +# to be able to tell the messages apart, for example. +# +sub id { + my $self = shift; + return $self->{'id'}; } +# +# The code is a bitvector - a combination of bitwise or'd constants. +# sub code { my $self = shift; return $self->{'code'}; } +# +# Returns a human readible version of what the code of this statmsg means. +# sub code_str { my $self = shift; return code2str($self->{'code'}); } - # # converts a code to a string # @@ -86,6 +127,9 @@ sub code2str { if($code & ENTIRE) { $str .= ' entire'; } + if($code & PARTIAL) { + $str .= ' partial'; + } if($code & LONG) { $str .= ' long'; } @@ -146,6 +190,16 @@ sub code2str { if($code & ALL_INTRON) { $str .= " consumed by frameshift intron"; } + if($code & TRANSLATES) { + $str .= " translates"; + } + if($code & SPLIT) { + $str .= " split"; + } + if($code & NO_SEQUENCE_LEFT) { + $str .= " no sequence left"; + } + return "$str\n"; } diff --git a/misc-scripts/chimp/Transcript.pm b/misc-scripts/chimp/Transcript.pm index 02ff9b45e68818732613c47d99c9c2126f2eabcf..5cd8dd0c6ec47bc9c0126dac9a5d2c7d57f6db3a 100644 --- a/misc-scripts/chimp/Transcript.pm +++ b/misc-scripts/chimp/Transcript.pm @@ -12,23 +12,24 @@ package Transcript; use StatMsg; use InterimTranscript; +use Utils qw(print_exon); use Bio::EnsEMBL::Transcript; use Bio::EnsEMBL::Exon; -use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::Exception qw(throw info); # # Sets the phases of the interim exons in an interim transcript # failed exons are skipped completely. # # Note that the phases of the original exon are ignored, so there is a -# possibility of bad translations when the original start exon had a non-zero +# possibility of bad translations when the original start exon had a non-zero # start phase. # sub set_iexon_phases { my $itranscript = shift; - debug("setting exon phases for : " . $itranscript->stable_id()); + info("setting exon phases for : " . $itranscript->stable_id()); my @iexons = @{$itranscript->get_all_Exons()}; @@ -119,7 +120,7 @@ sub check_iexons { my $transcript_seq_region = undef; my $transcript_strand = undef; - debug("checking exons for : " . $itranscript->stable_id()); + info("checking exons for : " . $itranscript->stable_id()); my $first = 1; @@ -133,7 +134,7 @@ sub check_iexons { # when one of them is fatal? # if ($iexon->fail() || $iexon->is_fatal()) { - debug(" failed/fatal exon, splitting transcript"); + info(" failed/fatal exon, splitting transcript"); # split this transcript in two, and restart the processing # at the beginning of the second transcript @@ -170,7 +171,7 @@ sub check_iexons { $transcript_seq_region = $iexon->seq_region(); } elsif ($transcript_seq_region ne $iexon->seq_region()) { - debug(" scaffold span, splitting transcript"); + info(" scaffold span, splitting transcript"); my $stat_msg = StatMsg->new(StatMsg::TRANSCRIPT|StatMsg::SCAFFOLD_SPAN); $itranscript->add_StatMsg($stat_msg); @@ -193,7 +194,7 @@ sub check_iexons { (defined($prev_start) && $iexon->strand() == -1 && $prev_start < $iexon->end())) { - debug(" inversion, splitting transcript"); + info(" inversion, splitting transcript"); my $stat_msg = StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::INVERT); $itranscript->add_StatMsg($stat_msg); @@ -214,7 +215,7 @@ sub check_iexons { if (!defined($transcript_strand)) { $transcript_strand = $iexon->strand(); } elsif ($transcript_strand != $iexon->strand()) { - debug(" strand flip, splitting transcript"); + info(" strand flip, splitting transcript"); my $stat_msg = StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::STRAND_FLIP); $itranscript->add_StatMsg($stat_msg); @@ -239,7 +240,7 @@ sub check_iexons { if ($total_exons > 0) { push @$itranscript_array, $itranscript; } else { - debug(" no exons left in transcript"); + info(" no exons left in transcript"); } return $itranscript_array; @@ -272,9 +273,9 @@ sub split_itrans { # all the exons until the 'bad exon' are loaded into the first transcript - debug("==FIRST TRANSCRIPT:\n"); + info("==FIRST TRANSCRIPT:\n"); while($cur_exon && $cur_exon != $bad_exon) { - print_exon($cur_exon); + print_exon($cur_exon); push @first_exons, $cur_exon; $first_trans->add_Exon($cur_exon); $cur_exon = shift(@remaining_exons); @@ -284,8 +285,8 @@ sub split_itrans { throw("unexpected: could not find bad exon in transcript"); } - debug("==BAD EXON: ". (($keep_exon) ? 'keeping' : 'discarding')); - print_exon($bad_exon); + info("==BAD EXON: ". (($keep_exon) ? 'keeping' : 'discarding')); + print_exon($bad_exon); # keep the 'bad exon' if the flag was set if($keep_exon) { @@ -294,9 +295,9 @@ sub split_itrans { # the remaining exons are reloaded into the second (original) transcript $itrans->flush_Exons(); - debug("==SECOND TRANSCRIPT:\n"); + info("==SECOND TRANSCRIPT:\n"); foreach my $exon (@remaining_exons) { - print_exon($exon); + print_exon($exon); $itrans->add_Exon($exon); } @@ -337,7 +338,7 @@ sub split_itrans { my $cdna_shift = 0; foreach my $ex (@first_exons) { $cdna_shift += $ex->length(); - # debug("cdna_shift = $cdna_shift\n"); + info("cdna_shift = $cdna_shift\n"); } # if we threw away the 'bad' exon we want to take into account @@ -348,8 +349,7 @@ sub split_itrans { $cdna_shift += $bad_exon->length(); } - # debug("Shifting CDNA of second transcript by $cdna_shift\n"); - # debug("First exon (2nd trans) cdna_start = ".$remaining_exons[0]->cdna_start()."\n"); + info("Shifting CDNA of second transcript by $cdna_shift\n"); if($cdna_shift) { foreach my $ex (@remaining_exons) { @@ -386,7 +386,7 @@ sub make_Transcript { my $transcript = Bio::EnsEMBL::Transcript->new(); $transcript->stable_id($itrans->stable_id); - # debug("making final transcript for ". $itrans->stable_id); + info("making final transcript for ". $itrans->stable_id); my $translation; @@ -412,13 +412,9 @@ sub make_Transcript { -STABLE_ID => $iexon->stable_id(), -SLICE => $slice); - #print STDERR "Adding exon " . $exon->stable_id . " to final transcript\n"; $transcript->add_Exon($exon); - # # see if this exon is the start or end exon of the translation - # - if ($translation) { if ($iexon->cdna_start() <= $itrans->cdna_coding_start() && $iexon->cdna_end() >= $itrans->cdna_coding_start()) { @@ -430,13 +426,6 @@ sub make_Transcript { if ($iexon->cdna_start() <= $itrans->cdna_coding_end() && $iexon->cdna_end() >= $itrans->cdna_coding_end()) { - #debug("end exon=".$exon->stable_id()."\n". - # " ex_coding_start=".$iexon->cdna_start()."\n". - # " ex_coding_end=".$iexon->cdna_end()."\n". - # " start=".$iexon->start()."\n". - # " end=".$iexon->end()."\n". - # " transl_end = ".$itrans->cdna_coding_end()); - my $translation_end = $itrans->cdna_coding_end() - $iexon->cdna_start() + 1; $translation->end_Exon($exon); @@ -465,49 +454,6 @@ sub make_Transcript { return $transcript; } -sub print_exon { - my $exon = shift; - my $tr = shift; - - if (!$exon) { - throw("Exon undefined"); - } - - print STDERR " ",$exon->stable_id, "\n"; - - print STDERR " cdna_start = ",$exon->cdna_start(),"\n" - if(defined($exon->cdna_start())); - - print STDERR " cdna_end = ", $exon->cdna_end(), "\n" - if(defined($exon->cdna_end())); - - print STDERR " start = ". $exon->start() . "\n" - if(defined($exon->start())); - - print STDERR " end = ". $exon->end() . "\n" - if(defined($exon->end())); - - print STDERR " strand = ". $exon->strand() . "\n" - if(defined($exon->strand())); - - if($exon->fail) { - print STDERR " FAILED\n"; - } - - if($tr) { - print STDERR " TRANSCRIPT:\n"; - print STDERR " cdna_coding_start = ". $tr->cdna_coding_start() . "\n"; - print STDERR " cdna_coding_end = ". $tr->cdna_coding_end(). "\n\n"; - } - - return; -} - - -sub debug { - my $msg = shift; - print STDERR $msg, "\n"; -} 1; diff --git a/misc-scripts/chimp/human2chimp.pl b/misc-scripts/chimp/human2chimp.pl index 50072611a0482d49cdc22617573c13fb5aa15fbb..2b00c8cda0aec92469798a0581c0d51e188c0ca8 100644 --- a/misc-scripts/chimp/human2chimp.pl +++ b/misc-scripts/chimp/human2chimp.pl @@ -11,6 +11,10 @@ use StatMsg; use Deletion; use Insertion; use Transcript; +use StatLogger; +use StatMsg; + +use Utils qw(print_exon print_coords print_translation); use Bio::EnsEMBL::Utils::Exception qw(throw info verbose); @@ -20,7 +24,7 @@ use Bio::EnsEMBL::Utils::Exception qw(throw info verbose); my ($hhost, $hdbname, $huser, $hpass, $hport, $hassembly, #human vars $hchromosome, $hstart, $hend, $chost, $cdbname, $cuser, $cpass, $cport, $cassembly, #chimp vars - $help, $verbose); + $help, $verbose, $logfile); GetOptions('hhost=s' => \$hhost, 'hdbname=s' => \$hdbname, @@ -37,6 +41,7 @@ use Bio::EnsEMBL::Utils::Exception qw(throw info verbose); 'cpass=s' => \$cpass, 'cport=i' => \$cport, 'cassembly=s' => \$cassembly, + 'logfile=s' => \$logfile, 'help' => \$help, 'verbose' => \$verbose); @@ -74,6 +79,8 @@ use Bio::EnsEMBL::Utils::Exception qw(throw info verbose); -port => $cport); + StatMsg::set_logger(StatLogger->new($logfile)); + $human_db->dnadb($chimp_db); my $slice_adaptor = $human_db->get_SliceAdaptor(); @@ -109,6 +116,8 @@ use Bio::EnsEMBL::Utils::Exception qw(throw info verbose); my $mapper = $asmap_adaptor->fetch_by_CoordSystems($chimp_cs, $human_cs); + my $total_transcripts = 0; + foreach my $slice (@$slices) { info("Chromosome: " . $slice->seq_region_name()); @@ -123,69 +132,80 @@ use Bio::EnsEMBL::Utils::Exception qw(throw info verbose); foreach my $transcript (@$transcripts) { next if(!$transcript->translation); #skip pseudo genes + print ++$total_transcripts, "\n"; + my $interim_transcript = transfer_transcript($transcript, $mapper, $human_cs); my $finished_transcripts = create_transcripts($interim_transcript, $slice_adaptor); + my $transcript_count = @$finished_transcripts; + my $translation_count = 0; + my $stop_codons_count = 0; + + if($transcript_count > 1) { + StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::SPLIT); + } + elsif($transcript_count== 0) { + StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::NO_SEQUENCE_LEFT); + } + foreach my $ftrans (@$finished_transcripts) { if($ftrans->translation()) { + $translation_count++; my $pep = $ftrans->translate->seq(); + print STDERR "\n\n$pep\n\n"; + if($pep =~ /\*/) { + $stop_codons_count++; + } + # sanity check, if translation is defined we expect a peptide if(!$pep) { - dump_translation($ftrans->translation()); + print_translation($ftrans->translation()); throw("Unexpected Translation but no peptide"); } } else { print STDERR "NO TRANSLATION LEFT\n"; } } - } - } - } -} - -sub dump_translation { - my $tl = shift; - - print STDERR "TRANSLATION\n"; - if(!$tl) { - print STDERR " undef\n"; - return; - } - - if($tl->start_Exon()) { - print STDERR " start exon = ", $tl->start_Exon->stable_id, "\n"; - } else { - print STDERR " start exon = undef\n"; - } - - if($tl->end_Exon()) { - print STDERR " end exon = ", $tl->end_Exon->stable_id, "\n"; - } else { - print STDERR " end exon = undef\n"; - } + # If there were stop codons in one of the split transcripts + # report it. Report it as 'entire' if all split transcripts had + # stops. + if($stop_codons_count) { + my $code = StatMsg::TRANSCRIPT | StatMsg::DOESNT_TRANSLATE; + if($stop_codons_count == $translation_count) { + $code |= StatMsg::ENTIRE; + } else { + $code |= StatMsg::PARTIAL; + } + StatMsg->new($code); + } - if(defined($tl->start())) { - print STDERR " start = ", $tl->start(), "\n"; - } else { - print STDERR " start = undef\n"; - } + if(!$translation_count) { + StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::NO_CDS_LEFT); + } - if(defined($tl->end())) { - print STDERR " end = ", $tl->end(), "\n"; - } else { - print STDERR " end = undef\n"; + if($translation_count) { + if($stop_codons_count) { + if($translation_count > $stop_codons_count) { + StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::TRANSLATES | + StatMsg::PARTIAL); + } + } else { + StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::TRANSLATES | + StatMsg::ENTIRE); + } + } + } + } } - - - return; } + ############################################################################### # # transfer_transcript @@ -372,6 +392,8 @@ sub get_coords_extent { my $stat_code = StatMsg::EXON; + #print_coords($coords); + foreach my $c (@$coords) { next if($c->isa('Bio::EnsEMBL::Mapper::Gap')); @@ -522,3 +544,5 @@ EOF } + + diff --git a/misc-scripts/chimp/run_hum2chimp.sh b/misc-scripts/chimp/run_hum2chimp.sh index 398cc1c72cadd3b8783442971d1e43f56470b842..89f4cad42b883d4f065280205e387869d484c5c5 100755 --- a/misc-scripts/chimp/run_hum2chimp.sh +++ b/misc-scripts/chimp/run_hum2chimp.sh @@ -1,3 +1,3 @@ #!/bin/sh -/usr/local/ensembl/bin/perl human2chimp.pl -hdbname homo_sapiens_core_20_34b -hhost ecs4 -huser ensro -hport 3351 -cdbname mcvicker_chimp_human_merge -chost ecs4 -cport 3350 -cuser ensro -verbose -hassembly NCBI34 -cassembly BROAD1 +/usr/local/ensembl/bin/perl human2chimp.pl -hdbname homo_sapiens_core_20_34b -hhost ecs4 -huser ensro -hport 3351 -cdbname mcvicker_chimp_human_merge -chost ecs4 -cport 3350 -cuser ensro -hassembly NCBI34 -cassembly BROAD1 -hchromosome 9 -logfile chimp.log