From a1411221c7a289c7ccc07e2c6ea8c6273ddc62ae Mon Sep 17 00:00:00 2001 From: Glenn Proctor <gp1@sanger.ac.uk> Date: Wed, 8 Dec 2004 15:57:15 +0000 Subject: [PATCH] Changed xref dumping to generate SQL to retrieve only those xrefs we need. --- .../xref_mapping/XrefMapper/BasicMapper.pm | 124 +++++++++--------- 1 file changed, 59 insertions(+), 65 deletions(-) diff --git a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm index 90997234f8..c1470265e1 100644 --- a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm +++ b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm @@ -149,8 +149,8 @@ sub get_species_id_from_species_name{ sub get_set_lists{ my ($self) = @_; -# return [["ExonerateGappedBest1", ["homo_sapiens","Uniprot/SWISSPROT"]]]; -return [["ExonerateGappedBest1", ["homo_sapiens","*"]]]; + # return [["ExonerateGappedBest1", ["homo_sapiens","Uniprot/SWISSPROT"]]]; + return [["ExonerateGappedBest1", ["homo_sapiens","*"]]]; } @@ -246,7 +246,7 @@ sub dump_xref{ foreach my $list (@lists){ # print "method->".@$list[0]."\n"; $method[$i] = shift @$list; - my $j = 1; + my $j = 0; my @source_id=(); my @species_id=(); foreach my $element (@$list){ @@ -297,72 +297,64 @@ sub dump_xref{ sub dump_subset{ + my ($self,$xref,$rspecies_id,$rsource_id,$index) = @_; - - open(XDNA,">".$xref->dir()."/xref_".$index."_dna.fasta") - || die "Could not open xref_".$index."_dna.fasta"; - my $sql = "select p.xref_id, p.sequence, x.species_id , x.source_id "; - $sql .= " from primary_xref p, xref x "; - $sql .= " where p.xref_id = x.xref_id and "; - $sql .= " p.sequence_type ='dna' "; - if(defined($self->maxdump())){ - $sql .= "limit ".$self->maxdump()." "; - } - -# for (my $j =1; $j<scalar(@$rspecies_id); $j++){ -# print $j."\t".$$rspecies_id[$j]."\t".$$rsource_id[$j]."\n"; -# } - # return $xref->dir."/xref_".$i."_dna.fasta"; - - my $sth = $xref->dbi()->prepare($sql); - $sth->execute(); - while(my @row = $sth->fetchrow_array()){ - my $pass = 0; - for (my $j =1; $j<scalar(@$rspecies_id); $j++){ - if($$rspecies_id[$j] < 0 or $row[2] == $$rspecies_id[$j]){ - if($$rsource_id[$j] < 0 or $row[3] == $$rsource_id[$j]){ - $pass = 1; - } - } + # generate or condition list for species and sources + my $final_clause; + my $use_all = 0; + my @or_list; + for (my $j = 0; $j < scalar(@$rspecies_id); $j++){ + my @condition; + if($$rspecies_id[$j] > 0){ + push @condition, "x.species_id=" . $$rspecies_id[$j]; } - if($pass){ - $row[1] =~ s/(.{60})/$1\n/g; - print XDNA ">".$row[0]."\n".$row[1]."\n"; + if($$rsource_id[$j] > 0){ + push @condition, "x.source_id=" . $$rsource_id[$j]; } - } - close XDNA; + # note if both source and species are * (-1) there's no need for a final clause + + if ( !@condition ) { + $use_all = 1; + last; + } + + push @or_list, join (" AND ", @condition); - open(XPRO,">".$xref->dir."/xref_".$index."_prot.fasta") - || die "Could not open xref_".$index."_prot.fasta"; - my $sql = "select p.xref_id, p.sequence, x.species_id , x.source_id "; - $sql .= " from primary_xref p, xref x "; - $sql .= " where p.xref_id = x.xref_id and "; - $sql .= " p.sequence_type ='peptide' "; - if(defined($self->maxdump())){ - $sql .= "limit ".$self->maxdump()." "; } - - - $sth = $xref->dbi()->prepare($sql); - $sth->execute(); - while(my @row = $sth->fetchrow_array()){ - my $pass = 0; - for (my $j =1; $j<scalar(@$rspecies_id); $j++){ - if($$rspecies_id[$j] < 0 or $row[2] == $$rspecies_id[$j]){ - if($$rsource_id[$j] < 0 or $row[3] == $$rsource_id[$j]){ - $pass = 1; - } - } + + $final_clause = " AND ((" . join(") OR (", @or_list) . "))" unless ($use_all) ; + + + for my $sequence_type ('dna', 'peptide') { + + my $filename = $xref->dir() . "/xref_" . $index . "_" . $sequence_type . ".fasta"; + open(XREF_DUMP,">$filename") || die "Could not open $filename"; + + my $sql = "SELECT p.xref_id, p.sequence, x.species_id , x.source_id "; + $sql .= " FROM primary_xref p, xref x "; + $sql .= " WHERE p.xref_id = x.xref_id AND "; + $sql .= " p.sequence_type ='$sequence_type' "; + $sql .= $final_clause; + + if(defined($self->maxdump())){ + $sql .= " LIMIT ".$self->maxdump()." "; } - if($pass){ + + my $sth = $xref->dbi()->prepare($sql); + $sth->execute(); + while(my @row = $sth->fetchrow_array()){ + $row[1] =~ s/(.{60})/$1\n/g; - print XPRO ">".$row[0]."\n".$row[1]."\n"; + print XREF_DUMP ">".$row[0]."\n".$row[1]."\n"; + } + + close(XREF_DUMP); + $sth->finish(); + } - $sth->finish(); - close XPRO; } @@ -425,7 +417,7 @@ sub fetch_and_dump_seq{ } open(PEP,">".$self->ensembl_protein_file()) - || die("Could not open dna file for writing: ".$self->ensembl_protein_file."\n"); + || die("Could not open protein file for writing: ".$self->ensembl_protein_file."\n"); my $gene_adap = $db->get_GeneAdaptor(); my @gene_ids = @{$gene_adap->list_dbIDs()}; @@ -450,13 +442,16 @@ sub fetch_and_dump_seq{ print PEP ">".$trans->dbID()."\n".$pep_seq."\n"; } } - if(defined($max) and $i > $max){ - goto FIN; - } + + last if(defined($max) and $i > $max); + } -FIN: + close DNA; close PEP; + + print time() . " after \n"; + exit(0); } @@ -1225,7 +1220,6 @@ sub build_gene_display_xrefs { my $best_transcript_length = -1; foreach my $transcript_id (@transcripts) { if (!$transcript_display_xrefs->{$transcript_id}) { - #print "No display_xref assigned to transcript $transcript_id\n"; $trans_no_xref++; next; } else { @@ -1246,7 +1240,7 @@ sub build_gene_display_xrefs { if (!$best_xref) { #XXXwarn("Couldn't find a display xref for gene id $gene_id\n"); $miss++; - } else { + } else{ # Write record print GENE_DX "UPDATE gene SET display_xref_id=" . $best_xref . " WHERE gene_id=" . $gene_id . ";\n"; $hit++; -- GitLab