Commit e1dd9393 authored by Arnaud Kerhornou's avatar Arnaud Kerhornou
Browse files

TAIR GO associatation file parser for A.thaliana

parent 7533ca3d
package XrefParser::TAIROntologyParser;
=pod
=head1 NAME
XrefParser::TAIROntologyParser
=head1 DESCRIPTION
Parse ontology files. File format (and example data):
1: DB, database contributing the file (always "TAIR" for this file).
2: DB_Object_ID (TAIR's unique identifier for genes).
3: DB_Object_Symbol, see below
4: Qualifier (optional), one or more of 'NOT', 'contributes_to',
'colocalizes_with' as qualifier(s) for a GO annotation, when needed,
multiples separated by pipe (|)
5: GO ID, unique numeric identifier for the GO term
6: DB:Reference(|DB:Reference), the reference associated with the GO
annotation
7: Evidence, the evidence code for the GO annotation
8: With (or) From (optional), any With or From qualifier for the GO
annotation
9: Aspect, which ontology the GO term belongs (Function, Process or
Component)
10: DB_Object_Name(|Name) (optional), a name for the gene product in
words, e.g. 'acid phosphatase'
11: DB_Object_Synonym(|Synonym) (optional), see below
12: DB_Object_Type, type of object annotated, e.g. gene, protein, etc.
13: taxon(|taxon), taxonomic identifier of species encoding gene
product
14: Date, date GO annotation was made in the format
15: Assigned_by, source of the annotation (either "TAIR" or "TIGR")
E.g.:
Field1: TAIR
Field2: locus:2031476
Field3: ENO1
Field4:
Field5: GO:0000015
Field6: TAIR:AnalysisReference:501748310
Field7: IEA
Field8: INTERPRO:IPR000941|INTERPRO:IPR020809|INTERPRO:IPR020810|INTERPRO:IPR020811
Field9: C
Field10: enolase 1
Field11: AT1G74030|AT1G74030.1|F2P9.10|F2P9_10
Field12: protein
Field13: taxon:3702
Field14: 20120418
Field15: TAIR
Field16:
Field17: TAIR:gene:2031475
Field1: TAIR
Field2: locus:2081840
Field3: LTP12
Field4:
Field5: PO:0001009
Field6: TAIR:Publication:501710265|AGRICOLA_IND:IND23314018
Field7: IDA
Field8:
Field9: G
Field10: AT3G51590
Field11: AT3G51590|LTP12|lipid transfer protein 12|T18N14.1
Field12: protein
Field13: taxon:3702
Field14: 20060215
Field15: TAIR
Field16:
Field17: TAIR:locus:2081840
=head1 AUTHOR
Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.
=cut
use strict;
use warnings;
use autodie;
use Data::Dumper;
use File::Basename 'basename';
use Readonly;
use List::MoreUtils 'uniq';
use XML::Simple 'XMLin';
use base qw( XrefParser::BaseParser );
Readonly my $DIRECT => 'DIRECT';
Readonly my $GENE => 'Gene';
Readonly my $TAIR_TRANSLATION => 'TAIR_TRANSLATION';
Readonly my @GAF_FIELDS => qw(
db
db_object_id
db_object_symbol
qualifier
ont_id
db_reference
evidence_code
with
aspect
db_object_name
db_object_synonym
db_object_type
taxon
date
assigned_by
annotation_extension
gene_product_form_id
);
# Creates ontology xrefs from a GAF annotation file.
# Also requires an OBO file mapping IDs to terms.
# The ontology xrefs are dependent on xrefs
#----------------------------------------------------------------------
sub run {
my ($self, $args) = @_;
my $source_id = $args->{'source_id'};
my $species_id = $args->{'species_id'};
my $files = $args->{'files'};
my $release_file = $args->{'rel_file'};
my $notify = sub { print @_ if $args->{'verbose'} };
unless ( ref $files eq 'ARRAY' ) {
die '"files" argument not an array?';
}
my ($ont_file) = grep { /\.(xml|obo)/ } @$files;
my ($assoc_file) = grep { $ont_file && !/$ont_file/ } @$files;
if ( $assoc_file ) {
$notify->(sprintf "%s parsing file '$assoc_file'\n", __PACKAGE__);
}
else {
printf STDERR "%s called without a 'files' argument\n%s",
__PACKAGE__, Dumper($args);
return 1; # error
}
# get the "main" GO/PO source id.
my $source_name = $self->get_source_name_for_source_id( $source_id );
$source_id = $self->get_source_id_for_source_name( $source_name, 'main' );
#
# Ontologies are attached to the translations, so this gets all the
# translation xrefs as a hash with the translation IDs as the keys
# and the xref IDs as the values.
#
my %master_xrefs = %{
$self->get_valid_codes( $TAIR_TRANSLATION , $species_id )
};
$species_id ||= $self->get_species_id_for_filename($assoc_file);
my %species_id2name = $self->species_id2name();
my $species = $species_id2name{$species_id}->[0];
$notify->("Parsing $source_name for $species\n");
#
# Get mappimg from GO terms to descriptions from the XML/OBO file
#
#
my %id_to_term;
if ( $ont_file ) {
my $obo_io = $self->get_filehandle( $ont_file );
if ( !defined $obo_io ) {
print STDERR "ERROR: Could not open '$ont_file'\n";
return 1; # 1 error
}
my $term = undef;
my $desc = undef;
if ( $ont_file =~ /\.xml$/ ) {
while ( $_ = $obo_io->getline() ) {
if (/\<id\>([G|P]O:\d+)\<\/id\>/) {
$term = $1;
}
elsif (/\<name\>(.*)\<\/name\>/) {
if ( defined($term) ) {
$id_to_term{$term} = $1;
}
$term = undef;
}
}
}
else {
while ( $_ = $obo_io->getline() ) {
if (/^id:\s+([G|P]O:\d+)/) {
$term = $1;
}
elsif (/^name:\s+(.*)/) {
if ( defined $term ) {
$id_to_term{ $term } = $1;
}
$term = undef;
}
}
}
$obo_io->close();
$notify->(
sprintf "Mapped %s terms to descriptions\n",
scalar keys %id_to_term
);
}
my %wrongtype;
my $num_terms = 0;
my $count = 0;
my $miss_parse = 0;
# Open the gene association GAF file
my $gaf_io = $self->get_filehandle( $assoc_file );
if ( !defined $gaf_io ) {
print STDERR "ERROR: Could not open $assoc_file\n";
return 1; # error
}
while ( my $line = $gaf_io->getline() ) {
next if $line =~ /^!/; # comment
chomp $line;
my @vals = split( /\t/, $line );
my %rec = map { $GAF_FIELDS[$_], $vals[$_] } 0..$#GAF_FIELDS;
next if $rec{'db'} ne 'TAIR'; # Only process TAIR annotations
next if $rec{'qualifier'} eq 'NOT'; # Skip "NOT" terms entirely
# Find the stable_id(s)
my @stable_ids = uniq(
map { defined $master_xrefs{ $_ } ? $_ : () }
grep { /^AT\d+G/ }
map { s/\'/\\\'/g; $_ }
map { split /[|]/ }
( $rec{'db_object_name'}, $rec{'db_object_synonym'} )
);
if ( !@stable_ids ) {
$miss_parse++;
next;
}
my $ont_id = $rec{'ont_id'};
my $desc = $id_to_term{ $ont_id } || '';
for my $stable_id ( @stable_ids ) {
printf "%-70s\r", sprintf(
"%10s: %s => %s", ++$num_terms, $ont_id, $stable_id
);
my @xref_ids = @{ $master_xrefs{ $stable_id } };
for my $xref_id ( @xref_ids ) {
$self->add_dependent_xref({
master_xref_id => $xref_id,
acc => $ont_id,
label => $ont_id,
desc => $desc,
linkage => $rec{'evidence_code'},
source_id => $source_id,
species_id => $species_id
});
}
}
$count++;
}
$gaf_io->close();
$notify->(
map { " - $_\n" }
"$count $source_name dependent xrefs added",
"$miss_parse lines did not contain a recognised stable_id",
);
if ( defined $release_file ) {
# Parse and set release information from $release_file.
my $release_io = $self->get_filehandle($release_file);
# Slurp mode.
local $/;
my $release = <$release_io>;
$release_io->close();
$release =~ tr/\n/ /;
$release =~ s#.*The following table describes.*?of (POC.*?)<ul>.*#$1#;
$release =~ s#<[^>]+>##g;
$notify->("$source_name release: '$release'\n");
$self->set_release( $source_id, $release );
}
return 0; # success
}
1;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment