Commit ef7f595f authored by Arnaud Kerhornou's avatar Arnaud Kerhornou
Browse files

Gramene various ontologies parser

parent 40b8ea75
package XrefParser::GrameneOntologyParser;
=pod
=head1 NAME
XrefParser::GrameneOntologyParser
=head1 DESCRIPTION
Parse ontology files. File format (and example data):
1: DB, database contributing the file
2: DB_Object_ID (gene stable id)
3: DB_Object_Symbol, see below
4: Qualifier (optional), one or more of 'NOT', 'contributes_to',
'colocalizes_with' as qualifier(s) for a GO annotation, when needed,
multiples separated by pipe (|)
5: Ontology accession, e.g., "PO:0007633"
6: DB:Reference(|DB:Reference), the reference associated with the GO
annotation
7: Evidence, the evidence code for the GO annotation
8: With (or) From (optional), any With or From qualifier for the GO
annotation
9: Aspect, which ontology the GO term belongs (Function, Process or
Component)
10: DB_Object_Name(|Name) (optional), a name for the gene product in
words, e.g. 'acid phosphatase'
11: DB_Object_Synonym(|Synonym) (optional), see below
12: DB_Object_Type, type of object annotated, e.g. gene, protein, etc.
13: taxon(|taxon), taxonomic identifier of species encoding gene
product
14: Date, date GO annotation was made in the format
15: Assigned_by, source of the annotation (either "TAIR" or "TIGR")
TAIR
locus:2007106
EXPA1
PO:0000293
TAIR:Publication:501680952
NAS
S
AT1G69530
AT1G69530|ATEXPA1|EXP1|AT-EXP1|ATEXP1|ATHEXP ALPHA 1.2|EXPA1|expansin A1|EXPANSIN 1|F10D13.18|F10D13_18
protein
taxon:3702
20040428
TAIR
TAIR:locus:2007106
=head1 AUTHOR
Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.
=cut
use strict;
use POSIX qw(strftime);
use File::Basename;
use base qw( XrefParser::BaseParser );
# Creates ontology xrefs from a GAF annotation file.
# Also requires an OBO file mapping IDs to terms.
# The ontology xrefs are dependent on xrefs
#----------------------------------------------------------------------
sub run {
my ($self, $args) = @_;
my $source_id = $args->{'source_id'};
my $species_id = $args->{'species_id'};
my $files = $args->{'files'};
my $release_file = $args->{'rel_file'};
my $notify = sub { print @_ if $args->{'verbose'} };
my $file = ref $files eq 'ARRAY' ? shift @$files : '';
if ( $file ) {
$notify->(sprintf "%s parsing file '$file'\n", __PACKAGE__);
}
else {
printf STDERR "%s called without a 'files' argument\n%s",
__PACKAGE__, Dumper($args);
return 1; # error
}
# get the 'PO' or 'GO' source_name
my $source_name = $self->get_source_name_for_source_id($source_id);
# get the "main" GO/PO source id.
$source_id = $self->get_source_id_for_source_name( $source_name, 'main' );
$species_id ||= $self->get_species_id_for_filename($file);
my %species_id2name = $self->species_id2name();
my $species = $species_id2name{$species_id}->[0];
$notify->("Parsing $source_name for $species\n");
# These Ontology xrefs depend on gene ID xrefs (TAIR_LOCUS) having been
# previously loaded - get the source.
my $gene_source_id = $self->get_source_id_for_source_name('TAIR_LOCUS');
#
# Get mappimg from GO terms to descriptions from the obo file.
#
# The gene_association file in GAF format
# http://www.geneontology.org/GO.format.gaf-2_0.shtml
# The obo file (xml or plain)
my %id_to_term;
if ( my $file_desc = shift @$files ) {
my $obo_io = $self->get_filehandle($file_desc);
if ( !defined $obo_io ) {
print STDERR "ERROR: Could not open $file_desc\n";
return 1; # 1 error
}
my $term = undef;
my $desc = undef;
if ( $file_desc =~ /xml/ ) { # xml file format
while ( $_ = $obo_io->getline() ) {
if (/\<id\>([G|P]O:\d+)\<\/id\>/) {
$term = $1;
}
elsif (/\<name\>(.*)\<\/name\>/) {
if ( defined($term) ) {
$id_to_term{$term} = $1;
}
$term = undef;
}
}
}
else {
while ( $_ = $obo_io->getline() ) {
if (/^id:\s+([G|P]O:\d+)/) {
$term = $1;
}
elsif (/^name:\s+(.*)/) {
if ( defined($term) ) {
$id_to_term{$term} = $1;
}
$term = undef;
}
}
}
$obo_io->close();
$notify->(
sprintf "Mapped %s terms to descriptions\n",
scalar keys %id_to_term
);
}
my %wrongtype;
# Get a list of taxon IDs given that some species have multiple e.g. strains
my %sp2tax = $self->species_id2taxonomy();
my @tax_ids = @{ $sp2tax{ $species_id } };
# Loop for each taxon
my $count = 0;
my $miss_parse = 0;
my $miss_get_xref = 0;
for my $tax_id ( @tax_ids ) {
$notify->("processing for taxon: $tax_id\n");
my $taxon_line = "taxon:" . $tax_id;
# Open the gene association GAF file
my $gaf_io = $self->get_filehandle( $file );
if ( !defined $gaf_io ) {
print STDERR "ERROR: Could not open $file\n";
return 1; # error
}
while ( my $line = $gaf_io->getline() ) {
next if $line =~ /^!/; # comment
# next unless /$taxon_line/; # only process data for this taxon
chomp;
my (
$DB, $DB_Object_ID,
$DB_Object_Symbol, $Qualifier,
$GO_ID, $DB_Reference,
$Evidence_Code, $With,
$Aspect, $DB_Object_Name,
$DB_Object_Synonym, $DB_Object_Type,
$Taxon, $Date,
$Assigned_By, $Annotation_Extension,
$Gene_Product_Form_ID, @extra
) = split( /\t/, $line );
# next unless $Taxon eq $taxon_line; # Only process this species
next if $Qualifier eq "NOT"; # Skip "NOT" terms entirely
$DB_Object_Name =~ s/\'/\\\'/g;
my $master = 0;
# in po file stable_id=$array[9] # in GO asso file, stable_id=
# ??[9] [10]
# Find the stable_id in either DB_Object_Name or DB_Object_Synonym
if ( my $stable_id = $DB_Object_Name || $DB_Object_Synonym ) {
my $xref_id = $self->add_xref({
acc => $stable_id,
label => '',
desc => $nasc_gene_id,
source_id => $nascg_source_id,
species_id => $species_id,
info_type => 'DIRECT',
});
$self->add_direct_xref(
$tairg_xref_id, $gene_stable_id, 'Gene', 'DIRECT'
);
$count++;
}
$gaf_io->close();
}
$notify->(
map { " - $_\n" }
"$count $source_name dependent xrefs added",
"$miss_parse lines did not contain a recognised stable_id",
"$miss_get_xref lines contained a stable_id that did not " .
"correspond with a known xref",
);
if ( defined $release_file ) {
# Parse and set release information from $release_file.
my $release_io = $self->get_filehandle($release_file);
# Slurp mode.
local $/;
my $release = <$release_io>;
$release_io->close();
$release =~ tr/\n/ /;
$release =~ s#.*The following table describes.*?of (POC.*?)<ul>.*#$1#;
$release =~ s#<[^>]+>##g;
$notify->("$source_name release: '$release'\n");
$self->set_release( $source_id, $release );
}
return 0; # success
}
1;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment