Skip to content
Snippets Groups Projects
Commit 10ee3fc2 authored by Glenn Proctor's avatar Glenn Proctor
Browse files

Added harvester to automatically download SwissProt files, and BaseHarvester...

Added harvester to automatically download SwissProt files, and BaseHarvester to allow all harvesters to be discovered and run automatically. Various tweaks to all files to make them cleaner and more consistent
parent 0a922a74
No related branches found
No related tags found
No related merge requests found
# Superclass providing common functionality for harvesters
# as well as the ability to run them all
package BaseHarvester;
use strict;
# --------------------------------------------------------------------------------
# Run every harvester when this file is executed directly rather than being
# loaded by another piece of code (caller() is undefined only in that case).
find_and_run_harvesters() unless defined(caller());
# --------------------------------------------------------------------------------
# Find harvester subclasses in the current directory and run them.
#
# Every *.pm file in the current working directory is loaded with require.
# Each class that inherits from BaseHarvester is then instantiated with new()
# and its run() method is called. Files whose name contains "BaseHarvester"
# are skipped. A file that fails to load produces a warning that includes the
# reason, and the scan carries on with the next file.
sub find_and_run_harvesters {

    # Perl 5.26 and later no longer search the current directory by default,
    # so add it for the duration of this scan only.
    local @INC = ('.', @INC);

    foreach my $file (glob("*.pm")) {

        # Derive the class name from a copy so the file name stays intact
        # for the error message below.
        (my $class = $file) =~ s/\..*$//;
        next if $class =~ /BaseHarvester/;

        if (!eval { require $file; 1 }) {
            warn("Error during require of " . $file . ": " . $@ . "\n");
            next;
        }

        # Check inheritance on the class name (function form never dies, even
        # for a name with no package behind it) so that unrelated modules are
        # not instantiated at all.
        next unless UNIVERSAL::isa($class, "BaseHarvester");

        print "Running " . $class . "\n";
        my $obj = $class->new();
        $obj->run();
    }
}
# --------------------------------------------------------------------------------
# Taxonomy IDs shared by the harvesters.
# Takes the invocant (which is not used) and returns the IDs as a list.
sub taxonomy_ids {
    my ($invocant) = @_;
    return (4530, 55529);
}
# --------------------------------------------------------------------------------
# Constructor. Returns a new, empty hash-based object blessed into the class
# (or the class of the object) it was invoked on, so that subclasses can
# inherit it. Falls back to BaseHarvester when called as a plain function
# without an invocant.
sub new {
    my $invocant = shift;
    my $class = ref($invocant) || $invocant || "BaseHarvester";
    return bless {}, $class;
}
# --------------------------------------------------------------------------------
1;
# Harvester for downloading SwissProt files
package SwissProtHarvester;
use strict;
use vars qw(@ISA);
@ISA = qw(BaseHarvester);
# --------------------------------------------------------------------------------
# Start the download when this file is executed directly rather than loaded
# by other code.
if (!defined(caller())) {
    run();
}
# --------------------------------------------------------------------------------
# Download the SwissProt proteome file for every taxonomy ID.
#
# For each ID returned by BaseHarvester->taxonomy_ids() the file
# <url><id>.SPC is fetched by running wget (list form, so no shell is
# involved) with --quiet and --timestamping. A download that cannot be
# started or that ends unsuccessfully is reported with a warning, and the
# remaining IDs are still attempted.
sub run {

    # URL should end with a /
    my $url = "ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/";
    my $ext = ".SPC";

    foreach my $taxon (BaseHarvester->taxonomy_ids()) {
        my $file = $url . $taxon . $ext;
        my $status = system("wget", "--quiet", "--timestamping", $file);

        if ($status == -1) {
            # system() returns -1 when the program could not be started at all
            warn("Could not run wget for " . $file . ": " . $! . "\n");
        }
        elsif ($status & 127) {
            # low 7 bits hold the number of the signal that ended the child
            warn("wget for " . $file . " died with signal " . ($status & 127) . "\n");
        }
        elsif ($status != 0) {
            # the exit code lives in the high byte of the wait status
            warn("wget failed for " . $file . " (exit status " . ($status >> 8) . ")\n");
        }
    }
}
# TODO logging, error handling
# --------------------------------------------------------------------------------
# Constructor. Returns a new, empty hash-based object blessed into the class
# (or the class of the object) it was invoked on, so that subclasses can
# inherit it. Falls back to SwissProtHarvester when called as a plain
# function without an invocant.
sub new {
    my $invocant = shift;
    my $class = ref($invocant) || $invocant || "SwissProtHarvester";
    return bless {}, $class;
}
# --------------------------------------------------------------------------------
1;
# Parse SwissProt files to create xrefs.
package SwissProtParser;
use strict;
use POSIX qw(strftime);
use DBI;
use Data::Dumper;
use File::Basename;
use BaseParser;
use vars qw(@ISA);
@ISA = qw(BaseParser);
# --------------------------------------------------------------------------------
# When executed directly (not loaded by other code), require exactly one
# command-line argument, printing a usage message and exiting with status 1
# otherwise, and then parse the given file.
unless (defined(caller())) {
    unless (scalar(@ARGV) == 1) {
        print "\nUsage: SwissProtParser.pm file.SPC\n\n";
        exit(1);
    }
    run();
}
# --------------------------------------------------------------------------------
# Parse the file named by the first command-line argument: look up its
# species, upload a source record for it, then upload the xrefs read from it.
sub run {
    my ($file) = ($ARGV[0]);

    my $species_id = get_species($file);

    my $source    = create_source($file);
    my $source_id = BaseParser->upload_source($source);

    my @xrefs = create_xrefs($source_id, $species_id, $file);
    BaseParser->upload_xrefs(@xrefs);
}
# --------------------------------------------------------------------------------
# Get species from file.
# For SwissProt files the part of the file's base name before the first dot
# is the taxonomy ID. The ID is looked up in the species table; if no species
# has it, a message is printed and the program exits with status 1.
# Returns the taxonomy ID.
sub get_species {
    my ($file) = @_;

    my ($species_id) = split(/\./, basename($file));

    my $query = BaseParser->dbi()->prepare("SELECT name FROM species WHERE taxonomy_id=?");
    $query->execute($species_id);

    # If several rows come back, the last one fetched wins.
    my $species_name;
    while (my @fields = $query->fetchrow_array()) {
        $species_name = $fields[0];
    }
    $query->finish;

    unless (defined $species_name) {
        print "Cannot find species corresponding to taxonomy ID " . $species_id . " - check species table\n";
        exit(1);
    }

    print "Taxonomy ID " . $species_id . " corresponds to " . $species_name . "\n";
    return $species_id;
}
# --------------------------------------------------------------------------------
# Create source object to be loaded into source table.
# Returns a hash reference holding the source NAME ("SwissProt"), the given
# file path as URL, and the file's modification time in local time,
# formatted as YYYYMMDDhhmmss, as FILE_MODIFIED_DATE.
sub create_source {
    my ($file) = @_;

    my $mtime = (stat($file))[9];    # field 9 of stat() is the modification time
    my $stamp = POSIX::strftime('%Y%m%d%H%M%S', localtime($mtime));

    # TODO URL? Release?
    my %source = (
        NAME               => "SwissProt",
        URL                => $file,
        FILE_MODIFIED_DATE => $stamp,
    );
    return \%source;
}
# --------------------------------------------------------------------------------
# Parse file into array of xref objects.
#
# Arguments: the source ID and species ID to stamp on every xref, and the
# path of the SwissProt file to read. The file is read one record at a time,
# records being terminated by a line containing only "//".
#
# Returns a list of hash references with the keys ACCESSION (first accession
# on the AC line), LABEL (text of the first DE line), SPECIES_ID, SOURCE_ID
# and SEQUENCE (the residues following the SQ header, with all whitespace
# removed). Dies if the file cannot be opened.
sub create_xrefs {
    my ($source_id, $species_id, $file) = @_;

    open(my $swissprot, '<', $file)
        or die "Can't open Swissprot file $file: $!\n";

    my @xrefs;
    {
        # Read whole records; local restores the separator automatically,
        # even if something dies while parsing.
        local $/ = "//\n";

        while (my $record = <$swissprot>) {

            # Blank lines after the final terminator are not a record.
            next unless $record =~ /\S/;

            my $xref = {};

            # Line codes are anchored to the start of a line so that text
            # elsewhere in the record (for example an ID line ending in
            # "..AC   STANDARD;") cannot be mistaken for them.
            ($xref->{ACCESSION}) = $record =~ /^AC\s+(\w+);/m;
            ($xref->{LABEL})     = $record =~ /^DE\s+(.+)/m;
            $xref->{SPECIES_ID}  = $species_id;
            $xref->{SOURCE_ID}   = $source_id;

            # extract sequence
            my ($seq) = $record =~ /^SQ\s+(.+)/ms;    # /s allows . to match newline
            $seq = "" unless defined $seq;            # record without an SQ section
            $seq =~ s/\/\///g;                        # remove trailing end-of-record character
            $seq =~ s/\s//g;                          # remove whitespace (including newlines)
            $seq =~ s/^.*;//;                         # remove the SQ header: everything up to the last ;
            $xref->{SEQUENCE} = $seq;

            push @xrefs, $xref;
        }
    }
    close($swissprot);

    print "Read " . scalar(@xrefs) . " xrefs from $file\n";
    return @xrefs;
}
# --------------------------------------------------------------------------------
# Constructor. Returns a new, empty hash-based object blessed into the class
# (or the class of the object) it was invoked on, so that subclasses can
# inherit it. Falls back to SwissProtParser when called as a plain function
# without an invocant.
sub new {
    my $invocant = shift;
    my $class = ref($invocant) || $invocant || "SwissProtParser";
    return bless {}, $class;
}
# --------------------------------------------------------------------------------
1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment