Skip to content
Snippets Groups Projects
Commit c8e314fe authored by Glenn Proctor
Browse files

Parser for each file is now stored directly in the source_url table

Parser for each file is now stored directly in the source_url table rather than hardcoded in BaseParser.
parent 5b63dd63
No related branches found
No related tags found
No related merge requests found
......@@ -23,12 +23,6 @@ my %dependent_sources;
my %taxonomy2species_id;
my %name2species_id;
my %filetype2parser = (
"UniProtSwissProt" => "SwissProtParser",
"UniProtTrEMBL" => "SwissProtParser",
"RefSeq" => "RefSeqParser"
);
run() if (!defined(caller()));
# --------------------------------------------------------------------------------
......@@ -37,10 +31,10 @@ run() if (!defined(caller()));
sub run {
my $dbi = dbi();
my $sth = $dbi->prepare("SELECT s.source_id, su.source_url_id, s.name, su.url, su.checksum FROM source s, source_url su WHERE s.download='Y' AND su.source_id=s.source_id ORDER BY s.name");
my $sth = $dbi->prepare("SELECT s.source_id, su.source_url_id, s.name, su.url, su.checksum, su.parser FROM source s, source_url su WHERE s.download='Y' AND su.source_id=s.source_id ORDER BY s.name");
$sth->execute();
my ($source_id, $source_url_id, $name, $url, $checksum);
$sth->bind_columns(\$source_id, \$source_url_id, \$name, \$url, \$checksum);
my ($source_id, $source_url_id, $name, $url, $checksum, $parser);
$sth->bind_columns(\$source_id, \$source_url_id, \$name, \$url, \$checksum, \$parser);
my $last_type = "";
my $dir;
while (my @row = $sth->fetchrow_array()) {
......@@ -78,9 +72,8 @@ sub run {
update_source($dbi, $source_url_id, $file_cs, $file);
my $parserType = $filetype2parser{$type};
print "Parsing $file with $parserType\n";
$parserType->run("$dir/$file", $source_id);
print "Parsing $file with $parser\n";
$parser->run("$dir/$file", $source_id);
} else {
......@@ -373,7 +366,7 @@ sub insert_or_select {
if ($error) {
$id = get_xref_id_by_accession_and_source($acc, $source);
print "Got existing xref id " . $id . " for " . $acc . " " . $source . "\n";
#print "Got existing xref id " . $id . " for " . $acc . " " . $source . "\n";
} else {
......
......@@ -83,6 +83,7 @@ CREATE TABLE source_url (
checksum varchar(255),
file_modified_date datetime,
upload_date datetime,
parser varchar(255),
PRIMARY KEY (source_url_id),
KEY source_idx(source_id)
......@@ -107,49 +108,51 @@ INSERT INTO source VALUES (1002, 'MEDLINE', 1, 'N');
# --------------------------------------------------------------------------------
# UniProt/SwissProt
# Note: there is currently no UniProt/SwissProt data for fugu, Anopheles, C. briggsae or chicken.
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/9606.SPC', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/10090.SPC', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/10116.SPC', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/7227.SPC', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/6239.SPC', '', now(), now());
##INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/9031.SPC', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/9598.SPC', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/9606.SPC', '', now(), now(), "SwissProtParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/10090.SPC', '', now(), now(), "SwissProtParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/10116.SPC', '', now(), now(), "SwissProtParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/7227.SPC', '', now(), now(), "SwissProtParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/6239.SPC', '', now(), now(), "SwissProtParser");
##INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/9031.SPC', '', now(), now(), "SwissProtParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (1, 'ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/swissprot_files/proteomes/9598.SPC', '', now(), now(), "SwissProtParser");
# --------------------------------------------------------------------------------
# RefSeq - release/ and cumulative/ directories, for protein and mRNA
# release/protein
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian1.protein.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian2.protein.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian3.protein.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian4.protein.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian5.protein.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian6.protein.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian7.protein.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian8.protein.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian9.protein.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian10.protein.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian11.protein.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian12.protein.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian13.protein.faa.gz', '', now(), now());
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian1.protein.faa.gz', '', now(), now(), "RefSeqParser");
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian2.protein.faa.gz', '', now(), now(), "RefSeqParser");
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian3.protein.faa.gz', '', now(), now(), "RefSeqParser");
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian4.protein.faa.gz', '', now(), now(), "RefSeqParser");
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian5.protein.faa.gz', '', now(), now(), "RefSeqParser");
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian6.protein.faa.gz', '', now(), now(), "RefSeqParser");
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian7.protein.faa.gz', '', now(), now(), "RefSeqParser");
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian8.protein.faa.gz', '', now(), now(), "RefSeqParser");
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian9.protein.faa.gz', '', now(), now(), "RefSeqParser");
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian10.protein.faa.gz', '', now(), now(), "RefSeqParser");
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian11.protein.faa.gz', '', now(), now(), "RefSeqParser");
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian12.protein.faa.gz', '', now(), now(), "RefSeqParser");
#INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian13.protein.faa.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian1.protein.gpff.gz', '', now(), now(), "RefSeqGPFFParser");
# release/rna
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian1.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian2.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian3.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian4.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian5.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian6.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian7.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian8.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian9.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian10.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian11.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian12.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian13.rna.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian1.rna.fna.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian2.rna.fna.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian3.rna.fna.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian4.rna.fna.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian5.rna.fna.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian6.rna.fna.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian7.rna.fna.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian8.rna.fna.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian9.rna.fna.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian10.rna.fna.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian11.rna.fna.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian12.rna.fna.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/release/vertebrate_mammalian/vertebrate_mammalian13.rna.fna.gz', '', now(), now(), "RefSeqParser");
# cumulative
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/cumulative/rscu.faa.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/cumulative/rscu.fna.gz', '', now(), now());
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/cumulative/rscu.faa.gz', '', now(), now(), "RefSeqParser");
INSERT INTO source_url (source_id, url, checksum, file_modified_date, upload_date, parser) VALUES (2, 'ftp://ftp.ncbi.nih.gov/refseq/cumulative/rscu.fna.gz', '', now(), now(), "RefSeqParser");
################################################################################
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment