From 50b401374da8702572a848d56878481e4cd12151 Mon Sep 17 00:00:00 2001 From: Steve Trevanion <st3@sanger.ac.uk> Date: Tue, 19 Feb 2008 15:49:49 +0000 Subject: [PATCH] merge from vega-46-dev --- .../Bio/EnsEMBL/Utils/ConversionSupport.pm | 139 ++++++-- .../EnsEMBL/Utils/VegaCuration/Transcript.pm | 306 ++++++++++++++++++ .../EnsEMBL/Utils/VegaCuration/Translation.pm | 132 +++++++- 3 files changed, 542 insertions(+), 35 deletions(-) diff --git a/modules/Bio/EnsEMBL/Utils/ConversionSupport.pm b/modules/Bio/EnsEMBL/Utils/ConversionSupport.pm index 54d1b33f51..a4edca4754 100644 --- a/modules/Bio/EnsEMBL/Utils/ConversionSupport.pm +++ b/modules/Bio/EnsEMBL/Utils/ConversionSupport.pm @@ -8,7 +8,7 @@ schema conversion scripts =head1 SYNOPSIS my $serverroot = '/path/to/ensembl'; - my $suport = new Bio::EnsEMBL::Utils::ConversionSupport($serverroot); + my $support = new Bio::EnsEMBL::Utils::ConversionSupport($serverroot); # parse common options $support->parse_common_options; @@ -34,7 +34,8 @@ Please see http://www.ensembl.org/code_licence.html for details =head1 AUTHOR -Patrick Meidl <pm2@sanger.ac.uk> +Steve Trevanion <st3@sanger.ac.uk +Patrick Meidl <meidl@ebi.ac.uk> =head1 CONTACT @@ -235,9 +236,9 @@ sub get_common_params { ); } -=head2 get_lutre_params +=head2 get_loutre_params - Example : my @allowed_params = $self->get_lutre_params, 'extra_param'; + Example : my @allowed_params = $self->get_loutre_params, 'extra_param'; Description : Returns a list of commonly used parameters in for working with a loutre db Return type : Array - list of common parameters Exceptions : none @@ -255,6 +256,23 @@ sub get_loutre_params { ); } +=head2 remove_vega_params + + Example : $support->remove_vega_params; + Description : Removes Vega db conection parameters. Usefull to avoid clutter in log files when + working exclusively with loutre + Return type : none + Exceptions : none + Caller : general + +=cut + +sub remove_vega_params { + my $self = shift; + foreach my $param (qw(dbname host port user pass)) { + $self->{'_param'}{$param} = undef; + } +} =head2 confirm_params @@ -298,14 +316,14 @@ sub confirm_params { sub list_all_params { my $self = shift; - my $txt = sprintf " %-20s%-40s\n", qw(PARAMETER VALUE); - $txt .= " " . "-"x70 . "\n"; + my $txt = sprintf " %-21s%-40s\n", qw(PARAMETER VALUE); + $txt .= " " . "-"x71 . "\n"; $Text::Wrap::colums = 72; my @params = $self->allowed_params; foreach my $key (@params) { my @vals = $self->param($key); if (@vals) { - $txt .= Text::Wrap::wrap( sprintf(' %-20s', $key), + $txt .= Text::Wrap::wrap( sprintf(' %-21s', $key), ' 'x24, join(", ", @vals) ) . "\n"; @@ -671,7 +689,7 @@ sub get_database { -dbname => $self->param("${prefix}dbname"), -group => $database, ); - + # explicitely set the dnadb to itself - by default the Registry assumes # a group 'core' for this now $dba->dnadb($dba); @@ -821,14 +839,15 @@ sub dynamic_use { my ($self, $classname) = @_; my ($parent_namespace, $module) = $classname =~/^(.*::)(.*)$/ ? ($1,$2) : ('::', $classname); + no strict 'refs'; # return if module has already been imported - return 1 if $parent_namespace->{$module.'::'}; + return 1 if $parent_namespace->{$module.'::'} && %{ $parent_namespace->{$module.'::'}||{} }; eval "require $classname"; throw("Failed to require $classname: $@") if ($@); $classname->import(); - + return 1; } @@ -1464,7 +1483,7 @@ sub commify { Arg[3] : string $coord_system_name (optional) - 'chromosome' by default Arg[4] : string $coord_system_version (optional) - 'otter' by default Example : $chroms = $support->fetch_non_hidden_slice($sa); - Description : retrieve all slices from a lutra database that don't have a hidden attribute + Description : retrieve all slices from a loutra database that don't have a hidden attribute Return type : arrayref Caller : general Status : stable @@ -1475,16 +1494,78 @@ sub fetch_non_hidden_slices { my $self = shift; my $aa = shift or throw("You must supply an attribute adaptor"); my $sa = shift or throw("You must supply a slice adaptor"); - my $cs = shift || 'chromosome'; - my $cv = shift || 'Otter'; + my $cs = shift || 'chromosome'; + my $cv = shift || 'Otter'; my $visible_chroms; foreach my $chrom ( @{$sa->fetch_all($cs,$cv)} ) { - my $attribs = $aa->fetch_all_by_Slice($chrom); - push @$visible_chroms, $chrom if @{$self->get_attrib_values($attribs,'hidden',0)}; + my $chrom_name = $chrom->name; + my $attribs = $aa->fetch_all_by_Slice($chrom,'hidden'); + if ( scalar(@$attribs) > 1 ) { + $self->log_warning("More than one hidden attribute for chromosome $chrom_name\n"); + } + elsif ($attribs->[0]->value == 0) { + push @$visible_chroms, $chrom; + } + elsif ($attribs->[0]->value == 1) { + $self->log_verbose("chromosome $chrom_name is hidden\n"); + } + else { + $self->log_warning("No hidden attribute for chromosome $chrom_name\n"); + } } return $visible_chroms; } +=head2 get_wanted_chromosomes + + Arg[1] : B::E::U::ConversionSupport + Arg[2] : B::E::SliceAdaptor + Arg[3] : B::E::AttributeAdaptor + Arg[4] : string $coord_system_name (optional) - 'chromosome' by default + Arg[5] : string $coord_system_version (optional) - 'otter' by default + Example : @chr_names = &Slice::get_wanted_chromosomes($support,$laa,$lsa); + Description : retrieve names of slices from a lutra database that are ready for dumping to Vega. + Deals with list of names to ignore (ignore_chr = LIST) + Return type : arrayref + Caller : general + Status : stable + +=cut + +sub get_wanted_chromosomes { + my $self = shift; + my $aa = shift or throw("You must supply an attribute adaptor"); + my $sa = shift or throw("You must supply a slice adaptor"); + my $cs = shift || 'chromosome'; + my $cv = shift || 'Otter'; + my $export_mode = $self->param('release_type'); + my $release = $self->param('vega_release'); + my $names; + my $chroms = $self->fetch_non_hidden_slices($aa,$sa,$cs,$cv); + CHROM: + foreach my $chrom (@$chroms) { + my $attribs = $aa->fetch_all_by_Slice($chrom); + my $vals = $self->get_attrib_values($attribs,'vega_export_mod'); + if (scalar(@$vals > 1)) { + $self->log_warning ("Multiple attribs for \'vega_export_mod\', please fix before continuing"); + exit; + } + next CHROM if (! grep { $_ eq $export_mode} @$vals); + $vals = $self->get_attrib_values($attribs,'vega_release',$release); + if (scalar(@$vals > 1)) { + $self->log_warning ("Multiple attribs for \'vega_release\' value = $release , please fix before continuing"); + exit; + } + next CHROM if (! grep { $_ eq $release} @$vals); + my $name = $chrom->seq_region_name; + if (my @ignored = $self->param('ignore_chr')) { + next CHROM if (grep {$_ eq $name} @ignored); + } + push @{$names}, $name; + } + return $names; +} + =head2 get_attrib_values @@ -1492,12 +1573,13 @@ sub fetch_non_hidden_slices { Arg[2] : 'code' to search for Arg[3] : 'value' to search for (optional) Example : my $c = $self->get_attrib_values($attribs,'name')); - Description : (i) In the abscence of an attribute value argument examines an arrayref + Description : (i) In the absence of an attribute value argument, examines an arrayref of B::E::Attributes for a particular attribute type, returning the values - for each attribute of that type (can therefore be used to test for the - number of attributes of that type). - (ii) In the presence of the optional value argument, it can be used to test - for the presence of an attribute with a particular value + for each attribute of that type. Can therefore be used to test for the + number of attributes of that type. + (ii) In the presence of the optional value argument it returns all + attributes with that value ie can be used to test for the presence of an + attribute with that particular value. Return type : arrayref of values for that attribute Caller : general Status : stable @@ -1510,7 +1592,7 @@ sub get_attrib_values { my $code = shift; my $value = shift; if (my @atts = grep {$_->code eq $code } @$attribs) { - my $r; + my $r = []; if ($value) { if (my @values = grep {$_->value eq $value} @atts) { foreach (@values) { @@ -1536,17 +1618,17 @@ sub get_attrib_values { =head2 fix_attrib_value - Arg[1] : Arrayref of exisiting B::E::Attributes + Arg[1] : Arrayref of existing B::E::Attributes Arg[2] : dbID of object Arg[3] : name of object (just for reporting) Arg[4] : attrib_type.code Arg[5] : attrib_type.value - Arg[5] : interactive ? (0 by default) - Arg[6] : table + Arg[6] : interactive ? (0 by default) + Arg[7] : table Example : $support->fix_attrib_value($attribs,$chr_id,$chr_name,'vega_export_mod','N',1); Description : adds a new attribute to an object, or updates an existing attribute with a new value Can be run in interactive or non-interactive mode (default) - Return type : none + Return type : arrayref of results Caller : general Status : only ever tested with seq_region_attributes to date @@ -1562,14 +1644,13 @@ sub fix_attrib_value { my $interact = shift || 0; my $table = shift || 'seq_region_attrib'; - #set interactive parameter + #transiently set interactive parameter to zero my $int_before; if (! $interact) { $int_before = $self->param('interactive'); $self->param('interactive',0); } -# warn "interactive_before = $int_before"; #get any existing value(s) for this attribute my $existings = $self->get_attrib_values($attribs,$code); @@ -1589,9 +1670,9 @@ sub fix_attrib_value { exit; } + #...or update an attribute with new values... else { my $existing = $existings->[0]; - #...or update an attribute with new values... if ($existing ne $value) { if ($self->user_proceed("Do you want to reset $name attrib (code = $code) from $existing to $value ?")) { my $r = $self->update_attribute($id,$code,$value); @@ -1659,7 +1740,7 @@ sub store_new_attribute { my $self = shift; my $sr_id = shift; my $attrib_code = shift; - my $attrib_value = shift; + my $attrib_value = shift || ''; my $table = shift || 'seq_region_attrib'; #get database handle diff --git a/modules/Bio/EnsEMBL/Utils/VegaCuration/Transcript.pm b/modules/Bio/EnsEMBL/Utils/VegaCuration/Transcript.pm index 7dcc52ee31..f1cd29129a 100644 --- a/modules/Bio/EnsEMBL/Utils/VegaCuration/Transcript.pm +++ b/modules/Bio/EnsEMBL/Utils/VegaCuration/Transcript.pm @@ -23,9 +23,315 @@ Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk use strict; use warnings; +no warnings 'uninitialized'; use vars qw(@ISA); use Bio::EnsEMBL::Utils::VegaCuration::Gene; +use Data::Dumper; @ISA = qw(Bio::EnsEMBL::Utils::VegaCuration::Gene); + +=head2 find_non_overlaps + + Args : arrayref of B::E::Transcripts + Example : find_non_overlaps($all_transcripts) + Description: identifies any non-overlapping transcripts + Returntype : array refs of stable IDs + Exceptions : none + +=cut + +sub find_non_overlaps { + my $self = shift; + my ($all_transcripts) = @_; + my $non_overlaps = []; + foreach my $transcript1 (@{$all_transcripts}) { + foreach my $transcript2 (@{$all_transcripts}) { + if ($transcript1->end < $transcript2->start) { + push @{$non_overlaps}, $transcript1->stable_id; + push @{$non_overlaps}, $transcript2->stable_id; + } + } + } + return $non_overlaps; +} + +=head2 check_remarks_and_update names + + Arg[1] : B::E::Gene (with potentially duplicated transcript names) + Arg[2] : FH (for storing list of previous seen genes) + Arg[3] : counter 1 (no. of patched genes) + Arg[4] : counter 2 (no. of patched transcripts) + Example : $support->update_names($gene,$fh,\$c1,\$c2) + Description: - checks remarks and patches transcripts with identical names according to + CDS and length if there are no fragmented gene/transcript_remarks + - adds remark attribute to gene + - writes IDs of previously seen genes to file + Returntype : true | false (depending on whether patched or not), counter1, counter2 + +=cut + +sub check_remarks_and_update_names { + my $self = shift; + my ($gene,$k_flist_fh,$gene_c,$trans_c) = @_; + my $action = ($self->param('dry_run')) ? 'Would add' : 'Added'; + my $aa = $gene->adaptor->db->get_AttributeAdaptor; + my $dbh = $gene->adaptor->db->dbc->db_handle; + + #get list of IDs that have previously been sent to annotators + my $seen_genes = $self->get_havana_fragmented_loci_comments; + + my $gsi = $gene->stable_id; + my $gid = $gene->dbID; + my $g_name; + eval { + $g_name = $gene->display_xref->display_id; + }; + if ($@) { + $g_name = $gene->get_all_Attributes('name')->[0]->value; + } + my $gene_remark = 'This locus has been annotated as fragmented because either there is not enough evidence covering the whole locus to identify the exact exon structure of the transcript, or because the transcript spans a gap in the assembly'; + my $attrib = [ + Bio::EnsEMBL::Attribute->new( + -CODE => 'remark', + -NAME => 'Remark', + -DESCRIPTION => 'Annotation remark', + -VALUE => $gene_remark, + ) ]; + + #get existing gene and transcript remarks + my %remarks; + foreach my $type ('remark','hidden_remark') { + $remarks{$type}->{'gene'} = [ map {$_->value} @{$gene->get_all_Attributes($type)} ]; + foreach my $trans (@{$gene->get_all_Transcripts()}) { + my $tsi = $trans->stable_id; + push @{$remarks{$type}->{'transcripts'}}, map {$_->value} @{$trans->get_all_Attributes('remark')}; + } + } + + #if any of the remarks identify this gene as being known by Havana as being fragmented... + if ( (grep {$_ =~ /fragmen/i } @{$remarks{'hidden_remark'}->{'gene'}}, + @{$remarks{'remark'}->{'gene'}}, + @{$remarks{'remark'}->{'transcripts'}}, + @{$remarks{'hidden_remark'}->{'transcripts'}} ) ) { + if (grep { $_ eq $gene_remark} @{$remarks{'remark'}->{'gene'}}) { + $self->log("Fragmented loci annotation remark for gene $gid already exists\n"); + } + #add gene_attrib + else { + if (! $self->param('dry_run') ) { + $aa->store_on_Gene($gid,$attrib); + } + $self->log("$action correctly formatted fragmented loci annotation remark for gene $gsi\n"); + } + return (0,$gene_c,$trans_c); + } + #log if it's been reported before since the gene should have a remark. + elsif ($seen_genes->{$gsi} eq 'fragmented') { + $self->log_warning("PREVIOUS: $action correctly formatted fragmented loci annotation remark for gene $gsi (has previously been OKeyed by Havana as being fragmented but has no Annotation remark, please add one!)\n"); + print $k_flist_fh "$gsi\n"; + #add gene_attrib anyway. + if (! $self->param('dry_run') ) { + $aa->store_on_Gene($gid,$attrib); + } + return (0,$gene_c,$trans_c); + } + + #otherwise patch transcript names according to length and CDS + else { + $gene_c++; + my @trans = $gene->get_all_Transcripts(); + + #separate coding and non_coding transcripts + my $coding_trans = []; + my $noncoding_trans = []; + foreach my $trans ( @{$gene->get_all_Transcripts()} ) { + if ($trans->translate) { + push @$coding_trans, $trans; + } + else { + push @$noncoding_trans, $trans; + } + } + + #sort transcripts coding > non-coding, then on length + my $c = 0; + $self->log("\nPatching names according to CDS and length:\n",1); + foreach my $array_ref ($coding_trans,$noncoding_trans) { + foreach my $trans ( sort { $b->length <=> $a->length } @$array_ref ) { + $trans_c++; + my $tsi = $trans->stable_id; + my $t_name; + eval { + $t_name = $trans->display_xref->display_id; + }; + if ($@) { + $t_name = $trans->get_all_Attributes('name')->[0]->value; + } + $c++; + my $ext = sprintf("%03d", $c); + my $new_name = $g_name.'-'.$ext; + $self->log(sprintf("%-20s%-3s%-20s", "$t_name ", "-->", "$new_name")."\n",1); + if (! $self->param('dry_run')) { + + # update transcript display xref + $dbh->do(qq(UPDATE xref x, external_db edb + SET x.display_label = "$new_name" + WHERE x.external_db_id = edb.external_db_id + AND x.dbprimary_acc = "$tsi" + AND edb.db_name = "Vega_transcript")); + } + } + } + } + return (1,$gene_c,$trans_c); +} + +=head2 check_names_and_overlap + + Arg[1] : arayref of arrayrefs of duplicated names + Arg[2] : B::E::Gene (with potentially duplicated transcript names) + Arg[3] : FH (to log new duplicates) + Example : $support->check_names_and_overlap($transcripts,$gene,$fh) + Description: checks pairs of transcripts identified as having duplicate Vega names: + - to see if they have identical names in loutre (shouldn't have) + - distinguish between overlapping and non overlapping transcripts + Returntype : none + +=cut + +sub check_names_and_overlap { + my $self = shift; + my ($transcript_info,$gene,$n_flist_fh) = @_; + my $ta = $gene->adaptor->db->get_TranscriptAdaptor; + my $gsi = $gene->stable_id; + my $g_name = $gene->get_all_Attributes('name')->[0]->value; + foreach my $set (values %{$transcript_info} ) { + next if (scalar @{$set} == 1); + my $transcripts = []; + my $all_t_names; + my %ids_to_names; + foreach my $id1 (@{$set}) { + my ($name1,$tsi1) = split /\|/, $id1; + $ids_to_names{$tsi1} = $name1; + $all_t_names .= "$tsi1 [$name1] "; + my $t = $ta->fetch_by_stable_id($tsi1); + push @{$transcripts}, $t; + } + + my $non_overlaps; + eval { + $non_overlaps = $self->find_non_overlaps($transcripts); + }; + if ($@) { + $self->log_warning("Problem looking for overlapping transcripts for gene $gsi (is_current = 0 ?). Skipping this bit\n"); + } + + #if the transcripts don't overlap + elsif (@{$non_overlaps}) { + my $tsi_string; + foreach my $id (@{$non_overlaps}) { + my $string = " $id [ $ids_to_names{$id} ] "; + $tsi_string .= $string; + } + + $self->log_warning("NEW: Non-overlapping: $gsi ($g_name) has non-overlapping transcripts ($tsi_string) with duplicated Vega names, and it has no \'Annotation_remark- fragmented_loci\' on the gene or \'\%fragmen\%\' remark on any transcripts. Neither has it been OKeyed by Havana before. Transcript names are being patched but this needs checking by Havana.\n"); + #log gsi (to be sent to Havana) + print $n_flist_fh "$gsi\n"; + } + #...otherwise if the transcripts do overlap + elsif ($self->param('verbose')) { + $self->log_warning("NEW: Overlapping: $gsi ($g_name) has overlapping transcripts ($all_t_names) with Vega duplicated names and it has no \'Annotation_remark- fragmented_loci\' on the gene or \'\%fragmen\%\' remark on any transcripts. Neither has it been OKeyed by Havana before. Transcript names are being patched but this could be checked by Havana if they were feeling keen.\n"); + print $n_flist_fh "$gsi\n"; + } + } +} + +=head2 get_havana_fragmented_loci_comments + + Args : none + Example : my $results = $support->get_havana_fragmented_loci_comments + Description: parses the HEREDOC containing Havana comments in this module + Returntype : hashref + +=cut + +sub get_havana_fragmented_loci_comments { + my $seen_genes; + while (<DATA>) { + next if /^\s+$/ or /#+/; + my ($obj,$comment) = split /=/; + $obj =~ s/^\s+|\s+$//g; + $comment =~ s/^\s+|\s+$//g; + $seen_genes->{$obj} = $comment; + } + return $seen_genes; +} + + + +#details of genes with duplicated transcript names that have already been reported to Havana +#identified as either fragmented or as being OK to patch +__DATA__ + +OTTMUSG00000005478 = fragmented +OTTMUSG00000001936 = fragmented +OTTMUSG00000017081 = fragmented +OTTMUSG00000011441 = fragmented +OTTMUSG00000013335 = fragmented +OTTMUSG00000011654 = fragmented +OTTMUSG00000001835 = fragmented +OTTHUMG00000035221 = fragmented +OTTHUMG00000037378 = fragmented +OTTHUMG00000060732 = fragmented +OTTHUMG00000132441 = fragmented +OTTHUMG00000031383 = fragmented +OTTHUMG00000012716 = fragmented +OTTHUMG00000031102 = fragmented +OTTHUMG00000148816 = fragmented +OTTHUMG00000149059 = fragmented +OTTHUMG00000149221 = fragmented +OTTHUMG00000149326 = fragmented +OTTHUMG00000149644 = fragmented +OTTHUMG00000149574 = fragmented +OTTHUMG00000058101 = fragmented + +OTTHUMG00000150119 = OK +OTTHUMG00000149850 = OK +OTTHUMG00000058101 = OK + +OTTMUSG00000011654 = fragmented +OTTMUSG00000019369 = fragmented +OTTMUSG00000017081 = fragmented +OTTMUSG00000001835 = fragmented +OTTMUSG00000011499 = fragmented +OTTMUSG00000013335 = fragmented +OTTMUSG00000008023 = fragmented +OTTMUSG00000019369 = fragmented + + +OTTMUSG00000022266 +OTTMUSG00000006697 + + + + + +OTTMUSG00000012302 = +OTTMUSG00000013368 = +OTTMUSG00000015766 = +OTTMUSG00000016025 = +OTTMUSG00000001066 = +OTTMUSG00000016331 = +OTTMUSG00000006935 = +OTTMUSG00000007263 = +OTTMUSG00000000304 = +OTTMUSG00000009150 = +OTTMUSG00000008023 = +OTTMUSG00000017077 = +OTTMUSG00000003440 = +OTTMUSG00000016310 = +OTTMUSG00000026199 = +OTTMUSG00000028423 = +OTTMUSG00000007427 = diff --git a/modules/Bio/EnsEMBL/Utils/VegaCuration/Translation.pm b/modules/Bio/EnsEMBL/Utils/VegaCuration/Translation.pm index 1577eeab26..2c0f51658e 100644 --- a/modules/Bio/EnsEMBL/Utils/VegaCuration/Translation.pm +++ b/modules/Bio/EnsEMBL/Utils/VegaCuration/Translation.pm @@ -24,6 +24,7 @@ Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk use strict; use warnings; use vars qw(@ISA); +use Data::Dumper; use Bio::EnsEMBL::Utils::VegaCuration::Transcript; @@ -33,7 +34,8 @@ use Bio::EnsEMBL::Utils::VegaCuration::Transcript; Args : B::E::Transcript Example : my $results = $support->check_CDS_end_remarks($transcript) - Description: identifies incorrect 'CDS end...' transcript remarks + Description: identifies incorrect 'CDS end...' transcript remarks in a + otter-derived Vega database Returntype : hashref =cut @@ -51,7 +53,7 @@ sub check_CDS_start_end_remarks { my $stop_codon = substr($trans_seq, $coding_end-3, 3); my $start_codon = substr($trans_seq, $coding_start-1, 3); - #hasref to return results + #hashref to return results my $results; #extra CDS end not found remarks @@ -66,10 +68,10 @@ sub check_CDS_start_end_remarks { if ( $coding_end == $trans_end ) { if (! grep {$_->value eq 'CDS end not found'} @remarks) { if (grep {$_ eq $stop_codon} qw(TGA TAA TAG)) { - $results->{'END_MISSING_1'} = 1; + $results->{'END_MISSING_2'} = 1; } else { - $results->{'END_MISSING_2'} = $stop_codon; + $results->{'END_MISSING_1'} = $stop_codon; } } } @@ -87,12 +89,130 @@ sub check_CDS_start_end_remarks { if ( $coding_start == 1) { if ( ! grep {$_->value eq 'CDS start not found'} @remarks) { if ($start_codon eq 'ATG') { - $results->{'START_MISSING_1'} = 1; + $results->{'START_MISSING_2'} = 1; } else { - $results->{'START_MISSING_2'} = $start_codon; + $results->{'START_MISSING_1'} = $start_codon; } } } return $results; } + +=head2 check_CDS_end_remarks_loutre + + Args : B::E::Transcript + Example : my $results = $support->check_CDS_end_remarks($transcript) + Description: identifies incorrect 'CDS end...' transcript attribs in a loutre + of a loutre-derived Vega database. + Returntype : hashref + +=cut + +sub check_CDS_start_end_remarks_loutre { + my $self = shift; + my $trans = shift; + + # info for checking + my @stops = qw(TGA TAA TAG); + my %attributes; + foreach my $attribute (@{$trans->get_all_Attributes()}) { + $attributes{$attribute->code} = $attribute; + } + my $coding_end = $trans->cdna_coding_end; + my $coding_start = $trans->cdna_coding_start; + my $trans_end = $trans->length; + my $trans_seq = $trans->seq->seq; + my $stop_codon = substr($trans_seq, $coding_end-3, 3); + my $start_codon = substr($trans_seq, $coding_start-1, 3); + + #hashref to return results + my $results; + + #extra CDS end not found remarks + if ( ($attributes{'cds_end_NF'}->value == 1) + && ($coding_end != $trans_end) + && ( grep {$_ eq $stop_codon} @stops) ) { + $results->{'END_EXTRA'} = 1; + } + #missing CDS end not found remark + if ( $coding_end == $trans_end ) { + if ($attributes{'cds_end_NF'}->value == 0 ) { + if (grep {$_ eq $stop_codon} @stops) { + $results->{'END_MISSING_2'} = 1; + } + else { + $results->{'END_MISSING_1'} = $stop_codon; + } + } + } + #extra CDS start not found remark + if ( ($attributes{'cds_start_NF'}->value == 1 ) + && ($coding_start != 1) + && ($start_codon eq 'ATG') ) { + $results->{'START_EXTRA'} = 1; + } + #missing CDS start not found remark + if ( $coding_start == 1) { + if ( $attributes{'cds_start_NF'}->value == 0 ) { + if ($start_codon eq 'ATG') { + $results->{'START_MISSING_2'} = 1; + } else { + $results->{'START_MISSING_1'} = $start_codon; + } + } + } + return $results; +} + +=head2 get_havana_seleno_comments + + Args : none + Example : my $results = $support->get_havana_seleno_comments + Description: parses the HEREDOC containing Havana comments in this module + Returntype : hashref + +=cut + +sub get_havana_seleno_comments { + my $seen_translations; + while (<DATA>) { + next if /^\s+$/ or /#+/; + my ($obj,$comment) = split /=/; + $obj =~ s/^\s+|\s+$//g; + $comment =~ s/^\s+|\s+$//g; + $seen_translations->{$obj} = $comment; + } + return $seen_translations; +} + + +#details of annotators comments +__DATA__ + +OTTHUMT00000144659 = FIXED- changed to transcript +OTTHUMT00000276377 = FIXED- changed to transcript +OTTHUMT00000257741 = FIXED- changed to nmd +OTTHUMT00000155694 = NOT_FIXED- should be nmd but external annotation but cannot be fixed +OTTHUMT00000155695 = NOT_FIXED- should be nmd but external annotation but cannot be fixed +OTTHUMT00000282573 = FIXED- changed to unprocessed pseudogene +OTTHUMT00000285227 = FIXED- changed start site +OTTHUMT00000151008 = FIXED- incorrect trimming of CDS, removed extra stop codon +OTTHUMT00000157999 = FIXED- changed incorrect stop +OTTHUMT00000150523 = FIXED- incorrect trimming of CDS +OTTHUMT00000150525 = FIXED- incorrect trimming of CDS +OTTHUMT00000150522 = FIXED- incorrect trimming of CDS +OTTHUMT00000150521 = FIXED- incorrect trimming of CDS +OTTHUMT00000246819 = FIXED- corrected frame +OTTHUMT00000314078 = FIXED- corrected frame +OTTHUMT00000080133 = FIXED- corrected frame +OTTHUMT00000286423 = FIXED- changed to transcript +OTTMUST00000055509 = FIXED- error +OTTMUST00000038729 = FIXED- corrected frame +OTTMUST00000021760 = FIXED- corrected frame +OTTMUST00000023057 = FIXED- corrected frame +OTTMUST00000015207 = FIXED- corrected frame +OTTMUST00000056646 = FIXED- error +OTTMUST00000059686 = FIXED- corrected frame +OTTMUST00000013426 = FIXED- corrected frame +OTTMUST00000044412 = FIXED- error -- GitLab