Commit cb77baa6 authored by Steve Trevanion's avatar Steve Trevanion
Browse files

merge from vega-46-dev

parent d0a18ea7
...@@ -8,7 +8,7 @@ schema conversion scripts ...@@ -8,7 +8,7 @@ schema conversion scripts
=head1 SYNOPSIS =head1 SYNOPSIS
my $serverroot = '/path/to/ensembl'; my $serverroot = '/path/to/ensembl';
my $suport = new Bio::EnsEMBL::Utils::ConversionSupport($serverroot); my $support = new Bio::EnsEMBL::Utils::ConversionSupport($serverroot);
# parse common options # parse common options
$support->parse_common_options; $support->parse_common_options;
...@@ -34,7 +34,8 @@ Please see http://www.ensembl.org/code_licence.html for details ...@@ -34,7 +34,8 @@ Please see http://www.ensembl.org/code_licence.html for details
=head1 AUTHOR =head1 AUTHOR
Patrick Meidl <pm2@sanger.ac.uk> Steve Trevanion <st3@sanger.ac.uk
Patrick Meidl <meidl@ebi.ac.uk>
=head1 CONTACT =head1 CONTACT
...@@ -235,9 +236,9 @@ sub get_common_params { ...@@ -235,9 +236,9 @@ sub get_common_params {
); );
} }
=head2 get_lutre_params =head2 get_loutre_params
Example : my @allowed_params = $self->get_lutre_params, 'extra_param'; Example : my @allowed_params = $self->get_loutre_params, 'extra_param';
Description : Returns a list of commonly used parameters in for working with a loutre db Description : Returns a list of commonly used parameters in for working with a loutre db
Return type : Array - list of common parameters Return type : Array - list of common parameters
Exceptions : none Exceptions : none
...@@ -255,6 +256,23 @@ sub get_loutre_params { ...@@ -255,6 +256,23 @@ sub get_loutre_params {
); );
} }
=head2 remove_vega_params
Example : $support->remove_vega_params;
Description : Removes Vega db conection parameters. Usefull to avoid clutter in log files when
working exclusively with loutre
Return type : none
Exceptions : none
Caller : general
=cut
sub remove_vega_params {
my $self = shift;
foreach my $param (qw(dbname host port user pass)) {
$self->{'_param'}{$param} = undef;
}
}
=head2 confirm_params =head2 confirm_params
...@@ -298,14 +316,14 @@ sub confirm_params { ...@@ -298,14 +316,14 @@ sub confirm_params {
sub list_all_params { sub list_all_params {
my $self = shift; my $self = shift;
my $txt = sprintf " %-20s%-40s\n", qw(PARAMETER VALUE); my $txt = sprintf " %-21s%-40s\n", qw(PARAMETER VALUE);
$txt .= " " . "-"x70 . "\n"; $txt .= " " . "-"x71 . "\n";
$Text::Wrap::colums = 72; $Text::Wrap::colums = 72;
my @params = $self->allowed_params; my @params = $self->allowed_params;
foreach my $key (@params) { foreach my $key (@params) {
my @vals = $self->param($key); my @vals = $self->param($key);
if (@vals) { if (@vals) {
$txt .= Text::Wrap::wrap( sprintf(' %-20s', $key), $txt .= Text::Wrap::wrap( sprintf(' %-21s', $key),
' 'x24, ' 'x24,
join(", ", @vals) join(", ", @vals)
) . "\n"; ) . "\n";
...@@ -821,9 +839,10 @@ sub dynamic_use { ...@@ -821,9 +839,10 @@ sub dynamic_use {
my ($self, $classname) = @_; my ($self, $classname) = @_;
my ($parent_namespace, $module) = $classname =~/^(.*::)(.*)$/ ? my ($parent_namespace, $module) = $classname =~/^(.*::)(.*)$/ ?
($1,$2) : ('::', $classname); ($1,$2) : ('::', $classname);
no strict 'refs'; no strict 'refs';
# return if module has already been imported # return if module has already been imported
return 1 if $parent_namespace->{$module.'::'}; return 1 if $parent_namespace->{$module.'::'} && %{ $parent_namespace->{$module.'::'}||{} };
eval "require $classname"; eval "require $classname";
throw("Failed to require $classname: $@") if ($@); throw("Failed to require $classname: $@") if ($@);
...@@ -1464,7 +1483,7 @@ sub commify { ...@@ -1464,7 +1483,7 @@ sub commify {
Arg[3] : string $coord_system_name (optional) - 'chromosome' by default Arg[3] : string $coord_system_name (optional) - 'chromosome' by default
Arg[4] : string $coord_system_version (optional) - 'otter' by default Arg[4] : string $coord_system_version (optional) - 'otter' by default
Example : $chroms = $support->fetch_non_hidden_slice($sa); Example : $chroms = $support->fetch_non_hidden_slice($sa);
Description : retrieve all slices from a lutra database that don't have a hidden attribute Description : retrieve all slices from a loutra database that don't have a hidden attribute
Return type : arrayref Return type : arrayref
Caller : general Caller : general
Status : stable Status : stable
...@@ -1479,12 +1498,74 @@ sub fetch_non_hidden_slices { ...@@ -1479,12 +1498,74 @@ sub fetch_non_hidden_slices {
my $cv = shift || 'Otter'; my $cv = shift || 'Otter';
my $visible_chroms; my $visible_chroms;
foreach my $chrom ( @{$sa->fetch_all($cs,$cv)} ) { foreach my $chrom ( @{$sa->fetch_all($cs,$cv)} ) {
my $attribs = $aa->fetch_all_by_Slice($chrom); my $chrom_name = $chrom->name;
push @$visible_chroms, $chrom if @{$self->get_attrib_values($attribs,'hidden',0)}; my $attribs = $aa->fetch_all_by_Slice($chrom,'hidden');
if ( scalar(@$attribs) > 1 ) {
$self->log_warning("More than one hidden attribute for chromosome $chrom_name\n");
}
elsif ($attribs->[0]->value == 0) {
push @$visible_chroms, $chrom;
}
elsif ($attribs->[0]->value == 1) {
$self->log_verbose("chromosome $chrom_name is hidden\n");
}
else {
$self->log_warning("No hidden attribute for chromosome $chrom_name\n");
}
} }
return $visible_chroms; return $visible_chroms;
} }
=head2 get_wanted_chromosomes
Arg[1] : B::E::U::ConversionSupport
Arg[2] : B::E::SliceAdaptor
Arg[3] : B::E::AttributeAdaptor
Arg[4] : string $coord_system_name (optional) - 'chromosome' by default
Arg[5] : string $coord_system_version (optional) - 'otter' by default
Example : @chr_names = &Slice::get_wanted_chromosomes($support,$laa,$lsa);
Description : retrieve names of slices from a lutra database that are ready for dumping to Vega.
Deals with list of names to ignore (ignore_chr = LIST)
Return type : arrayref
Caller : general
Status : stable
=cut
sub get_wanted_chromosomes {
my $self = shift;
my $aa = shift or throw("You must supply an attribute adaptor");
my $sa = shift or throw("You must supply a slice adaptor");
my $cs = shift || 'chromosome';
my $cv = shift || 'Otter';
my $export_mode = $self->param('release_type');
my $release = $self->param('vega_release');
my $names;
my $chroms = $self->fetch_non_hidden_slices($aa,$sa,$cs,$cv);
CHROM:
foreach my $chrom (@$chroms) {
my $attribs = $aa->fetch_all_by_Slice($chrom);
my $vals = $self->get_attrib_values($attribs,'vega_export_mod');
if (scalar(@$vals > 1)) {
$self->log_warning ("Multiple attribs for \'vega_export_mod\', please fix before continuing");
exit;
}
next CHROM if (! grep { $_ eq $export_mode} @$vals);
$vals = $self->get_attrib_values($attribs,'vega_release',$release);
if (scalar(@$vals > 1)) {
$self->log_warning ("Multiple attribs for \'vega_release\' value = $release , please fix before continuing");
exit;
}
next CHROM if (! grep { $_ eq $release} @$vals);
my $name = $chrom->seq_region_name;
if (my @ignored = $self->param('ignore_chr')) {
next CHROM if (grep {$_ eq $name} @ignored);
}
push @{$names}, $name;
}
return $names;
}
=head2 get_attrib_values =head2 get_attrib_values
...@@ -1492,12 +1573,13 @@ sub fetch_non_hidden_slices { ...@@ -1492,12 +1573,13 @@ sub fetch_non_hidden_slices {
Arg[2] : 'code' to search for Arg[2] : 'code' to search for
Arg[3] : 'value' to search for (optional) Arg[3] : 'value' to search for (optional)
Example : my $c = $self->get_attrib_values($attribs,'name')); Example : my $c = $self->get_attrib_values($attribs,'name'));
Description : (i) In the abscence of an attribute value argument examines an arrayref Description : (i) In the absence of an attribute value argument, examines an arrayref
of B::E::Attributes for a particular attribute type, returning the values of B::E::Attributes for a particular attribute type, returning the values
for each attribute of that type (can therefore be used to test for the for each attribute of that type. Can therefore be used to test for the
number of attributes of that type). number of attributes of that type.
(ii) In the presence of the optional value argument, it can be used to test (ii) In the presence of the optional value argument it returns all
for the presence of an attribute with a particular value attributes with that value ie can be used to test for the presence of an
attribute with that particular value.
Return type : arrayref of values for that attribute Return type : arrayref of values for that attribute
Caller : general Caller : general
Status : stable Status : stable
...@@ -1510,7 +1592,7 @@ sub get_attrib_values { ...@@ -1510,7 +1592,7 @@ sub get_attrib_values {
my $code = shift; my $code = shift;
my $value = shift; my $value = shift;
if (my @atts = grep {$_->code eq $code } @$attribs) { if (my @atts = grep {$_->code eq $code } @$attribs) {
my $r; my $r = [];
if ($value) { if ($value) {
if (my @values = grep {$_->value eq $value} @atts) { if (my @values = grep {$_->value eq $value} @atts) {
foreach (@values) { foreach (@values) {
...@@ -1536,17 +1618,17 @@ sub get_attrib_values { ...@@ -1536,17 +1618,17 @@ sub get_attrib_values {
=head2 fix_attrib_value =head2 fix_attrib_value
Arg[1] : Arrayref of exisiting B::E::Attributes Arg[1] : Arrayref of existing B::E::Attributes
Arg[2] : dbID of object Arg[2] : dbID of object
Arg[3] : name of object (just for reporting) Arg[3] : name of object (just for reporting)
Arg[4] : attrib_type.code Arg[4] : attrib_type.code
Arg[5] : attrib_type.value Arg[5] : attrib_type.value
Arg[5] : interactive ? (0 by default) Arg[6] : interactive ? (0 by default)
Arg[6] : table Arg[7] : table
Example : $support->fix_attrib_value($attribs,$chr_id,$chr_name,'vega_export_mod','N',1); Example : $support->fix_attrib_value($attribs,$chr_id,$chr_name,'vega_export_mod','N',1);
Description : adds a new attribute to an object, or updates an existing attribute with a new value Description : adds a new attribute to an object, or updates an existing attribute with a new value
Can be run in interactive or non-interactive mode (default) Can be run in interactive or non-interactive mode (default)
Return type : none Return type : arrayref of results
Caller : general Caller : general
Status : only ever tested with seq_region_attributes to date Status : only ever tested with seq_region_attributes to date
...@@ -1562,14 +1644,13 @@ sub fix_attrib_value { ...@@ -1562,14 +1644,13 @@ sub fix_attrib_value {
my $interact = shift || 0; my $interact = shift || 0;
my $table = shift || 'seq_region_attrib'; my $table = shift || 'seq_region_attrib';
#set interactive parameter #transiently set interactive parameter to zero
my $int_before; my $int_before;
if (! $interact) { if (! $interact) {
$int_before = $self->param('interactive'); $int_before = $self->param('interactive');
$self->param('interactive',0); $self->param('interactive',0);
} }
# warn "interactive_before = $int_before";
#get any existing value(s) for this attribute #get any existing value(s) for this attribute
my $existings = $self->get_attrib_values($attribs,$code); my $existings = $self->get_attrib_values($attribs,$code);
...@@ -1589,9 +1670,9 @@ sub fix_attrib_value { ...@@ -1589,9 +1670,9 @@ sub fix_attrib_value {
exit; exit;
} }
#...or update an attribute with new values...
else { else {
my $existing = $existings->[0]; my $existing = $existings->[0];
#...or update an attribute with new values...
if ($existing ne $value) { if ($existing ne $value) {
if ($self->user_proceed("Do you want to reset $name attrib (code = $code) from $existing to $value ?")) { if ($self->user_proceed("Do you want to reset $name attrib (code = $code) from $existing to $value ?")) {
my $r = $self->update_attribute($id,$code,$value); my $r = $self->update_attribute($id,$code,$value);
...@@ -1659,7 +1740,7 @@ sub store_new_attribute { ...@@ -1659,7 +1740,7 @@ sub store_new_attribute {
my $self = shift; my $self = shift;
my $sr_id = shift; my $sr_id = shift;
my $attrib_code = shift; my $attrib_code = shift;
my $attrib_value = shift; my $attrib_value = shift || '';
my $table = shift || 'seq_region_attrib'; my $table = shift || 'seq_region_attrib';
#get database handle #get database handle
......
...@@ -23,9 +23,315 @@ Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk ...@@ -23,9 +23,315 @@ Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk
use strict; use strict;
use warnings; use warnings;
no warnings 'uninitialized';
use vars qw(@ISA); use vars qw(@ISA);
use Bio::EnsEMBL::Utils::VegaCuration::Gene; use Bio::EnsEMBL::Utils::VegaCuration::Gene;
use Data::Dumper;
@ISA = qw(Bio::EnsEMBL::Utils::VegaCuration::Gene); @ISA = qw(Bio::EnsEMBL::Utils::VegaCuration::Gene);
=head2 find_non_overlaps
Args : arrayref of B::E::Transcripts
Example : find_non_overlaps($all_transcripts)
Description: identifies any non-overlapping transcripts
Returntype : array refs of stable IDs
Exceptions : none
=cut
sub find_non_overlaps {
my $self = shift;
my ($all_transcripts) = @_;
my $non_overlaps = [];
foreach my $transcript1 (@{$all_transcripts}) {
foreach my $transcript2 (@{$all_transcripts}) {
if ($transcript1->end < $transcript2->start) {
push @{$non_overlaps}, $transcript1->stable_id;
push @{$non_overlaps}, $transcript2->stable_id;
}
}
}
return $non_overlaps;
}
=head2 check_remarks_and_update names
Arg[1] : B::E::Gene (with potentially duplicated transcript names)
Arg[2] : FH (for storing list of previous seen genes)
Arg[3] : counter 1 (no. of patched genes)
Arg[4] : counter 2 (no. of patched transcripts)
Example : $support->update_names($gene,$fh,\$c1,\$c2)
Description: - checks remarks and patches transcripts with identical names according to
CDS and length if there are no fragmented gene/transcript_remarks
- adds remark attribute to gene
- writes IDs of previously seen genes to file
Returntype : true | false (depending on whether patched or not), counter1, counter2
=cut
sub check_remarks_and_update_names {
my $self = shift;
my ($gene,$k_flist_fh,$gene_c,$trans_c) = @_;
my $action = ($self->param('dry_run')) ? 'Would add' : 'Added';
my $aa = $gene->adaptor->db->get_AttributeAdaptor;
my $dbh = $gene->adaptor->db->dbc->db_handle;
#get list of IDs that have previously been sent to annotators
my $seen_genes = $self->get_havana_fragmented_loci_comments;
my $gsi = $gene->stable_id;
my $gid = $gene->dbID;
my $g_name;
eval {
$g_name = $gene->display_xref->display_id;
};
if ($@) {
$g_name = $gene->get_all_Attributes('name')->[0]->value;
}
my $gene_remark = 'This locus has been annotated as fragmented because either there is not enough evidence covering the whole locus to identify the exact exon structure of the transcript, or because the transcript spans a gap in the assembly';
my $attrib = [
Bio::EnsEMBL::Attribute->new(
-CODE => 'remark',
-NAME => 'Remark',
-DESCRIPTION => 'Annotation remark',
-VALUE => $gene_remark,
) ];
#get existing gene and transcript remarks
my %remarks;
foreach my $type ('remark','hidden_remark') {
$remarks{$type}->{'gene'} = [ map {$_->value} @{$gene->get_all_Attributes($type)} ];
foreach my $trans (@{$gene->get_all_Transcripts()}) {
my $tsi = $trans->stable_id;
push @{$remarks{$type}->{'transcripts'}}, map {$_->value} @{$trans->get_all_Attributes('remark')};
}
}
#if any of the remarks identify this gene as being known by Havana as being fragmented...
if ( (grep {$_ =~ /fragmen/i } @{$remarks{'hidden_remark'}->{'gene'}},
@{$remarks{'remark'}->{'gene'}},
@{$remarks{'remark'}->{'transcripts'}},
@{$remarks{'hidden_remark'}->{'transcripts'}} ) ) {
if (grep { $_ eq $gene_remark} @{$remarks{'remark'}->{'gene'}}) {
$self->log("Fragmented loci annotation remark for gene $gid already exists\n");
}
#add gene_attrib
else {
if (! $self->param('dry_run') ) {
$aa->store_on_Gene($gid,$attrib);
}
$self->log("$action correctly formatted fragmented loci annotation remark for gene $gsi\n");
}
return (0,$gene_c,$trans_c);
}
#log if it's been reported before since the gene should have a remark.
elsif ($seen_genes->{$gsi} eq 'fragmented') {
$self->log_warning("PREVIOUS: $action correctly formatted fragmented loci annotation remark for gene $gsi (has previously been OKeyed by Havana as being fragmented but has no Annotation remark, please add one!)\n");
print $k_flist_fh "$gsi\n";
#add gene_attrib anyway.
if (! $self->param('dry_run') ) {
$aa->store_on_Gene($gid,$attrib);
}
return (0,$gene_c,$trans_c);
}
#otherwise patch transcript names according to length and CDS
else {
$gene_c++;
my @trans = $gene->get_all_Transcripts();
#separate coding and non_coding transcripts
my $coding_trans = [];
my $noncoding_trans = [];
foreach my $trans ( @{$gene->get_all_Transcripts()} ) {
if ($trans->translate) {
push @$coding_trans, $trans;
}
else {
push @$noncoding_trans, $trans;
}
}
#sort transcripts coding > non-coding, then on length
my $c = 0;
$self->log("\nPatching names according to CDS and length:\n",1);
foreach my $array_ref ($coding_trans,$noncoding_trans) {
foreach my $trans ( sort { $b->length <=> $a->length } @$array_ref ) {
$trans_c++;
my $tsi = $trans->stable_id;
my $t_name;
eval {
$t_name = $trans->display_xref->display_id;
};
if ($@) {
$t_name = $trans->get_all_Attributes('name')->[0]->value;
}
$c++;
my $ext = sprintf("%03d", $c);
my $new_name = $g_name.'-'.$ext;
$self->log(sprintf("%-20s%-3s%-20s", "$t_name ", "-->", "$new_name")."\n",1);
if (! $self->param('dry_run')) {
# update transcript display xref
$dbh->do(qq(UPDATE xref x, external_db edb
SET x.display_label = "$new_name"
WHERE x.external_db_id = edb.external_db_id
AND x.dbprimary_acc = "$tsi"
AND edb.db_name = "Vega_transcript"));
}
}
}
}
return (1,$gene_c,$trans_c);
}
=head2 check_names_and_overlap
Arg[1] : arayref of arrayrefs of duplicated names
Arg[2] : B::E::Gene (with potentially duplicated transcript names)
Arg[3] : FH (to log new duplicates)
Example : $support->check_names_and_overlap($transcripts,$gene,$fh)
Description: checks pairs of transcripts identified as having duplicate Vega names:
- to see if they have identical names in loutre (shouldn't have)
- distinguish between overlapping and non overlapping transcripts
Returntype : none
=cut
sub check_names_and_overlap {
my $self = shift;
my ($transcript_info,$gene,$n_flist_fh) = @_;
my $ta = $gene->adaptor->db->get_TranscriptAdaptor;
my $gsi = $gene->stable_id;
my $g_name = $gene->get_all_Attributes('name')->[0]->value;
foreach my $set (values %{$transcript_info} ) {
next if (scalar @{$set} == 1);
my $transcripts = [];
my $all_t_names;
my %ids_to_names;
foreach my $id1 (@{$set}) {
my ($name1,$tsi1) = split /\|/, $id1;
$ids_to_names{$tsi1} = $name1;
$all_t_names .= "$tsi1 [$name1] ";
my $t = $ta->fetch_by_stable_id($tsi1);
push @{$transcripts}, $t;
}
my $non_overlaps;
eval {
$non_overlaps = $self->find_non_overlaps($transcripts);
};
if ($@) {
$self->log_warning("Problem looking for overlapping transcripts for gene $gsi (is_current = 0 ?). Skipping this bit\n");
}
#if the transcripts don't overlap
elsif (@{$non_overlaps}) {
my $tsi_string;
foreach my $id (@{$non_overlaps}) {
my $string = " $id [ $ids_to_names{$id} ] ";
$tsi_string .= $string;
}
$self->log_warning("NEW: Non-overlapping: $gsi ($g_name) has non-overlapping transcripts ($tsi_string) with duplicated Vega names, and it has no \'Annotation_remark- fragmented_loci\' on the gene or \'\%fragmen\%\' remark on any transcripts. Neither has it been OKeyed by Havana before. Transcript names are being patched but this needs checking by Havana.\n");
#log gsi (to be sent to Havana)
print $n_flist_fh "$gsi\n";
}
#...otherwise if the transcripts do overlap
elsif ($self->param('verbose')) {
$self->log_warning("NEW: Overlapping: $gsi ($g_name) has overlapping transcripts ($all_t_names) with Vega duplicated names and it has no \'Annotation_remark- fragmented_loci\' on the gene or \'\%fragmen\%\' remark on any transcripts. Neither has it been OKeyed by Havana before. Transcript names are being patched but this could be checked by Havana if they were feeling keen.\n");
print $n_flist_fh "$gsi\n";
}
}
}
=head2 get_havana_fragmented_loci_comments
Args : none
Example : my $results = $support->get_havana_fragmented_loci_comments
Description: parses the HEREDOC containing Havana comments in this module
Returntype : hashref
=cut
sub get_havana_fragmented_loci_comments {
my $seen_genes;
while (<DATA>) {
next if /^\s+$/ or /#+/;
my ($obj,$comment) = split /=/;
$obj =~ s/^\s+|\s+$//g;
$comment =~ s/^\s+|\s+$//g;
$seen_genes->{$obj} = $comment;
}
return $seen_genes;
}
#details of genes with duplicated transcript names that have already been reported to Havana
#identified as either fragmented or as being OK to patch
__DATA__
OTTMUSG00000005478 = fragmented