Skip to content
Snippets Groups Projects
Commit f27e3298 authored by Ian Longden's avatar Ian Longden
Browse files

wormBase stuff added

parent 112ed948
No related branches found
No related tags found
No related merge requests found
......@@ -260,6 +260,52 @@ sub get_source_id_for_source_name {
return $source_id;
}
# Build a lookup from dependent-xref accession to dependent xref_id.
#
# Arguments:
#   $self                        - invocant (not used inside this routine).
#   $dependent_name              - source name of the dependent xrefs wanted
#                                  (looked up in source.name).
#   @reverse_ordered_source_list - source names of the master xrefs the
#                                  dependents must hang off.
#
# Returns a hash reference: { accession => dependent_xref_id }.
#
# When the same accession is found through several master sources, the row
# fetched last wins, i.e. sources later in @reverse_ordered_source_list
# overwrite earlier ones (presumably why callers pass the list in reverse
# priority order -- confirm against callers).
sub get_valid_xrefs_for_dependencies{
  my ($self, $dependent_name, @reverse_ordered_source_list) = @_;

  my %dependent_2_xref;

  # One prepared statement is reused to resolve every source name to its
  # source_id(s); a name may match more than one row in the source table.
  my $source_sql = "select source_id from source where name =?";
  my $source_sth = dbi()->prepare($source_sql);

  my @dependent_sources;
  $source_sth->execute($dependent_name);
  while(my @row = $source_sth->fetchrow_array()){
    push @dependent_sources, $row[0];
  }

  my @sources;
  foreach my $name (@reverse_ordered_source_list){
    $source_sth->execute($name);
    while(my @row = $source_sth->fetchrow_array()){
      push @sources, $row[0];
    }
  }
  $source_sth->finish;

  # x1 is the master xref, x2 the dependent xref linked via dependent_xref.
  my $xref_sql = "select d.dependent_xref_id, x2.accession ";
  $xref_sql   .= "  from dependent_xref d, xref x1, xref x2 ";
  $xref_sql   .= "    where  x1.xref_id = d.master_xref_id and";
  $xref_sql   .= "           x1.source_id=? and ";
  $xref_sql   .= "           x2.xref_id = d.dependent_xref_id and";
  $xref_sql   .= "           x2.source_id=? ";

  # A separately named handle: redeclaring "my $sth" in the same scope
  # masks the earlier variable and raises a warning under "use warnings".
  my $xref_sth = dbi()->prepare($xref_sql);
  foreach my $d (@dependent_sources){
    foreach my $s (@sources){
      # Bind order: master source_id first, dependent source_id second.
      $xref_sth->execute($s, $d);
      while(my @row = $xref_sth->fetchrow_array()){
        # accession => dependent_xref_id; later rows overwrite earlier ones.
        $dependent_2_xref{$row[1]} = $row[0];
      }
    }
  }
  $xref_sth->finish;

  return \%dependent_2_xref;
}
sub get_valid_codes{
my ($self,$source_name,$species_id) =@_;
my %valid_codes;
......
......@@ -31,6 +31,7 @@ sub run {
my $file = shift;
my $source_id = shift;
my $species_id = shift;
my %wrongtype;
if(!defined($source_id)){
$source_id = XrefParser::BaseParser->get_source_id_for_filename($file);
......@@ -42,13 +43,15 @@ sub run {
my (%swiss) = %{XrefParser::BaseParser->get_valid_codes("uniprot",$species_id)};
my (%refseq) = %{XrefParser::BaseParser->get_valid_codes("refseq",$species_id)};
my %worm;
my $wormset;
my $count = 0;
open(GO,"<".$file) || die "Could not open $file\n";
my $taxon_line = "taxon:".$species_id;
my $miss =0;
while (<GO>) {
if(/$taxon_line/){
chomp;
......@@ -73,8 +76,28 @@ sub run {
$count++;
}
}
else{
print STDERR "unknown type ".$array[0]."\n";
elsif($array[0] =~ /^WB$/){
#WB CE20707 ZYG-9 GO:0008017 WB:WBPaper00003099|PMID:9606208 ISS F protein taxon:6239 20030829 WB
if(!defined($wormset)){
$wormset = 1;
%worm = %{XrefParser::BaseParser->get_valid_xrefs_for_dependencies
('wormbase_transcript','Uniprot/SPTREMBL','RefSeq_peptide',
'Uniprot/SWISSPROT')};
}
if(defined($worm{$array[2]})){
XrefParser::BaseParser->add_to_xrefs($worm{$array[2]},$array[4],'',$array[4],'',$array[6],$source_id,$species_id);
$count++;
}
else{
$miss++;
if($miss < 10){
print "miss: ".$array[2]."\n";
}
}
}
elsif(!defined($wrongtype{$array[0]})){
print STDERR "WARNING: unknown type ".$array[0]."\n";
$wrongtype{$array[0]} = 1;
}
}
}
......
......@@ -81,7 +81,17 @@ sub create_xrefs {
$type = 'dna';
$source_id = $dna_source_id;
} else{
} elsif($file =~ /RefSeq_dna/){
$type = 'dna';
$source_id = $dna_source_id;
} elsif($file =~ /RefSeq_protein/){
$type = 'peptide';
$source_id = $peptide_source_id;
}else{
die "Could not work out sequence type & source for $file\n";
}
......@@ -145,6 +155,7 @@ sub create_xrefs {
my @LocusIDline = $entry =~ /db_xref=.LocusID:(\d+)/g;
my @EntrezGeneIDline = $entry =~ /db_xref=.GeneID:(\d+)/g;
my @mimline = $entry =~ /db_xref=.MIM:(\d+)/g;
my @wormline = $entry =~ /db_xref=\"WormBase:(\S+)\"/g;
foreach my $ll (@LocusIDline) {
my %dep;
......@@ -153,6 +164,21 @@ sub create_xrefs {
$dep{ACCESSION} = $ll;
push @{$xref->{DEPENDENT_XREFS}}, \%dep;
}
foreach my $ll (@wormline) {
my %dep;
$dep{SOURCE_ID} = $dependent_sources{'wormbase_transcript'};
$dep{LINKAGE_SOURCE_ID} = $source_id;
$dep{ACCESSION} = $ll;
push @{$xref->{DEPENDENT_XREFS}}, \%dep;
if($ll =~ /(\S+\.\d+)/){
my $temp = $1;
my %dep2;
$dep2{SOURCE_ID} = $dependent_sources{'wormbase_gene'};
$dep2{LINKAGE_SOURCE_ID} = $source_id;
$dep2{ACCESSION} = $temp;
push @{$xref->{DEPENDENT_XREFS}}, \%dep2;
}
}
foreach my $ll (@EntrezGeneIDline) {
my %dep;
$dep{SOURCE_ID} = $dependent_sources{EntrezGene};
......
......@@ -224,12 +224,8 @@ sub create_xrefs {
my @dep_lines = split /\n/, $deps;
foreach my $dep (@dep_lines) {
if ($dep =~ /^DR\s+(.+)/) {
# print $dep."\n";
my ($source, $acc, @extra) = split /;\s*/, $1;
# print "source is $source \n";
# print "acc is $acc \n";
if (exists $dependent_sources{$source}) {
# print "EXISTS\n";
# create dependent xref structure & store it
my %dep;
$dep{SOURCE_NAME} = $source;
......@@ -238,7 +234,6 @@ sub create_xrefs {
$dep{ACCESSION} = $acc;
push @{$xref->{DEPENDENT_XREFS}}, \%dep; # array of hashrefs
if($dep =~ /EMBL/){
# print "prtein_id is ".$extra[0]."\n";
my ($protein_id) = $extra[0];
if($protein_id ne "-"){
my %dep2;
......@@ -250,6 +245,39 @@ sub create_xrefs {
}
}
}
elsif($source =~ /WormPep/){
#DR WormPep; F35D6.1b; CE28299.
my %dep;
$dep{SOURCE_NAME} = 'wormbase_transcript';
$dep{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
$dep{SOURCE_ID} = $dependent_sources{'wormbase_transcript'};
$dep{ACCESSION} = $acc;
push @{$xref->{DEPENDENT_XREFS}}, \%dep; # array of hashrefs
my $gene;
if($acc =~ /(\S+\.\d+)/){
$gene = $1;
my %dep2;
$dep2{SOURCE_NAME} = 'wormbase_gene';
$dep2{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
$dep2{SOURCE_ID} = $dependent_sources{'wormbase_gene'};
$dep2{ACCESSION} = $gene;
push @{$xref->{DEPENDENT_XREFS}}, \%dep2; # array of hashrefs
}
my %dep3;
my $worm_id ;
if($extra[0] =~ /([^.]+)/){
$worm_id = $1;
}
$dep3{SOURCE_NAME} = 'wormpep_id';
$dep3{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
$dep3{SOURCE_ID} = $dependent_sources{'wormpep_id'};
$dep3{ACCESSION} = $worm_id;
push @{$xref->{DEPENDENT_XREFS}}, \%dep3; # array of hashrefs
# print $worm_id."\t".$gene."\t".$acc."\n";
}
}
}
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment