Skip to content
Snippets Groups Projects
Commit ef7f595f authored by Arnaud Kerhornou's avatar Arnaud Kerhornou
Browse files

Gramene various ontologies parser

parent 40b8ea75
No related branches found
No related tags found
No related merge requests found
package XrefParser::GrameneOntologyParser;
=pod
=head1 NAME
XrefParser::GrameneOntologyParser
=head1 DESCRIPTION
Parse ontology files. File format (and example data):
1: DB, database contributing the file
2: DB_Object_ID (gene stable id)
3: DB_Object_Symbol, see below
4: Qualifier (optional), one or more of 'NOT', 'contributes_to',
'colocalizes_with' as qualifier(s) for a GO annotation, when needed,
multiples separated by pipe (|)
5: Ontology accession, e.g., "PO:0007633"
6: DB:Reference(|DB:Reference), the reference associated with the GO
annotation
7: Evidence, the evidence code for the GO annotation
8: With (or) From (optional), any With or From qualifier for the GO
annotation
9: Aspect, which ontology the GO term belongs (Function, Process or
Component)
10: DB_Object_Name(|Name) (optional), a name for the gene product in
words, e.g. 'acid phosphatase'
11: DB_Object_Synonym(|Synonym) (optional), see below
12: DB_Object_Type, type of object annotated, e.g. gene, protein, etc.
13: taxon(|taxon), taxonomic identifier of species encoding gene
product
14: Date, date GO annotation was made in the format
15: Assigned_by, source of the annotation (either "TAIR" or "TIGR")
TAIR
locus:2007106
EXPA1
PO:0000293
TAIR:Publication:501680952
NAS
S
AT1G69530
AT1G69530|ATEXPA1|EXP1|AT-EXP1|ATEXP1|ATHEXP ALPHA 1.2|EXPA1|expansin A1|EXPANSIN 1|F10D13.18|F10D13_18
protein
taxon:3702
20040428
TAIR
TAIR:locus:2007106
=head1 AUTHOR
Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.
=cut
use strict;
use POSIX qw(strftime);
use File::Basename;
use base qw( XrefParser::BaseParser );
# Creates ontology xrefs from a GAF annotation file.
# Also requires an OBO file mapping IDs to terms.
# The ontology xrefs are dependent on xrefs
#----------------------------------------------------------------------
sub run {
my ($self, $args) = @_;
my $source_id = $args->{'source_id'};
my $species_id = $args->{'species_id'};
my $files = $args->{'files'};
my $release_file = $args->{'rel_file'};
my $notify = sub { print @_ if $args->{'verbose'} };
my $file = ref $files eq 'ARRAY' ? shift @$files : '';
if ( $file ) {
$notify->(sprintf "%s parsing file '$file'\n", __PACKAGE__);
}
else {
printf STDERR "%s called without a 'files' argument\n%s",
__PACKAGE__, Dumper($args);
return 1; # error
}
# get the 'PO' or 'GO' source_name
my $source_name = $self->get_source_name_for_source_id($source_id);
# get the "main" GO/PO source id.
$source_id = $self->get_source_id_for_source_name( $source_name, 'main' );
$species_id ||= $self->get_species_id_for_filename($file);
my %species_id2name = $self->species_id2name();
my $species = $species_id2name{$species_id}->[0];
$notify->("Parsing $source_name for $species\n");
# These Ontology xrefs depend on gene ID xrefs (TAIR_LOCUS) having been
# previously loaded - get the source.
my $gene_source_id = $self->get_source_id_for_source_name('TAIR_LOCUS');
#
# Get mappimg from GO terms to descriptions from the obo file.
#
# The gene_association file in GAF format
# http://www.geneontology.org/GO.format.gaf-2_0.shtml
# The obo file (xml or plain)
my %id_to_term;
if ( my $file_desc = shift @$files ) {
my $obo_io = $self->get_filehandle($file_desc);
if ( !defined $obo_io ) {
print STDERR "ERROR: Could not open $file_desc\n";
return 1; # 1 error
}
my $term = undef;
my $desc = undef;
if ( $file_desc =~ /xml/ ) { # xml file format
while ( $_ = $obo_io->getline() ) {
if (/\<id\>([G|P]O:\d+)\<\/id\>/) {
$term = $1;
}
elsif (/\<name\>(.*)\<\/name\>/) {
if ( defined($term) ) {
$id_to_term{$term} = $1;
}
$term = undef;
}
}
}
else {
while ( $_ = $obo_io->getline() ) {
if (/^id:\s+([G|P]O:\d+)/) {
$term = $1;
}
elsif (/^name:\s+(.*)/) {
if ( defined($term) ) {
$id_to_term{$term} = $1;
}
$term = undef;
}
}
}
$obo_io->close();
$notify->(
sprintf "Mapped %s terms to descriptions\n",
scalar keys %id_to_term
);
}
my %wrongtype;
# Get a list of taxon IDs given that some species have multiple e.g. strains
my %sp2tax = $self->species_id2taxonomy();
my @tax_ids = @{ $sp2tax{ $species_id } };
# Loop for each taxon
my $count = 0;
my $miss_parse = 0;
my $miss_get_xref = 0;
for my $tax_id ( @tax_ids ) {
$notify->("processing for taxon: $tax_id\n");
my $taxon_line = "taxon:" . $tax_id;
# Open the gene association GAF file
my $gaf_io = $self->get_filehandle( $file );
if ( !defined $gaf_io ) {
print STDERR "ERROR: Could not open $file\n";
return 1; # error
}
while ( my $line = $gaf_io->getline() ) {
next if $line =~ /^!/; # comment
# next unless /$taxon_line/; # only process data for this taxon
chomp;
my (
$DB, $DB_Object_ID,
$DB_Object_Symbol, $Qualifier,
$GO_ID, $DB_Reference,
$Evidence_Code, $With,
$Aspect, $DB_Object_Name,
$DB_Object_Synonym, $DB_Object_Type,
$Taxon, $Date,
$Assigned_By, $Annotation_Extension,
$Gene_Product_Form_ID, @extra
) = split( /\t/, $line );
# next unless $Taxon eq $taxon_line; # Only process this species
next if $Qualifier eq "NOT"; # Skip "NOT" terms entirely
$DB_Object_Name =~ s/\'/\\\'/g;
my $master = 0;
# in po file stable_id=$array[9] # in GO asso file, stable_id=
# ??[9] [10]
# Find the stable_id in either DB_Object_Name or DB_Object_Synonym
if ( my $stable_id = $DB_Object_Name || $DB_Object_Synonym ) {
my $xref_id = $self->add_xref({
acc => $stable_id,
label => '',
desc => $nasc_gene_id,
source_id => $nascg_source_id,
species_id => $species_id,
info_type => 'DIRECT',
});
$self->add_direct_xref(
$tairg_xref_id, $gene_stable_id, 'Gene', 'DIRECT'
);
$count++;
}
$gaf_io->close();
}
$notify->(
map { " - $_\n" }
"$count $source_name dependent xrefs added",
"$miss_parse lines did not contain a recognised stable_id",
"$miss_get_xref lines contained a stable_id that did not " .
"correspond with a known xref",
);
if ( defined $release_file ) {
# Parse and set release information from $release_file.
my $release_io = $self->get_filehandle($release_file);
# Slurp mode.
local $/;
my $release = <$release_io>;
$release_io->close();
$release =~ tr/\n/ /;
$release =~ s#.*The following table describes.*?of (POC.*?)<ul>.*#$1#;
$release =~ s#<[^>]+>##g;
$notify->("$source_name release: '$release'\n");
$self->set_release( $source_id, $release );
}
return 0; # success
}
1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment