From 2bf1df17d32bdd74dcaa0e79301883495b674292 Mon Sep 17 00:00:00 2001 From: Ian Longden <ianl@sanger.ac.uk> Date: Thu, 7 Oct 2004 16:42:24 +0000 Subject: [PATCH] xref sequence dumps are now configurable. See get_set_lists method --- .../xref_mapping/XrefMapper/BasicMapper.pm | 259 ++++++++++++++---- 1 file changed, 201 insertions(+), 58 deletions(-) diff --git a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm index 558e414cab..2956ccbefe 100644 --- a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm +++ b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm @@ -33,36 +33,51 @@ Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk sub dump_seqs{ my ($self, $xref) = @_; - $self->dump_ensembl(); $self->dump_xref($xref); + $self->dump_ensembl(); } sub run_matching{ + my ($self,$xref) = @_; + + my @list=(); + + my $i = 0; + foreach my $method (@{$self->method()}){ + my @dna=(); + push @dna, $method; + push @dna, $xref->dir."/xref_".$i."_dna.fasta"; + push @dna, $self->ensembl_dna_file(); + push @list, \@dna; + my @pep=(); + push @pep, $method; + push @pep, $xref->dir."/xref_".$i."_prot.fasta"; + push @pep, $self->ensembl_protein_file(); + push @list, \@pep; + $i++; + } + + glenn_call(\@list); print "NOT done yet:-)\n"; } + +sub glenn_call{ + my ($lists) = @_; + foreach my $list (@$lists){ + my ($meth, $q ,$t) = @$list; + print "HELLO: ".$meth."\n\t".$q."\n\t".$t."\n\n"; + } +} + sub store{ print "NOT done yet Either :-)\n"; } -sub dump_xref{ - my ($self,$xref) = @_; +sub get_species_id_from_species_name{ + my ($xref,$species) = @_; - if(!defined($xref->species())){ - $xref->species($self->species); - } - if(!defined($xref->dir())){ - if(defined($self->species)){ - $xref->species($self->species) - } - else{ - $xref->species("."); - } - } - # - # the species specified must be in the database and hence have a species_id - # - my $sql = "select species_id from species where name = '".$xref->species."'"; + my $sql = "select species_id from species where name = '".$species."'"; my $dbi = $xref->dbi(); my $sth = $dbi->prepare($sql); $sth->execute(); @@ -71,68 +86,188 @@ sub dump_xref{ if (defined @row) { $species_id = $row[0]; } else { - print STDERR "Couldn't get ID for species ".$xref->species."\n"; + print STDERR "Couldn't get ID for species ".$species."\n"; print STDERR "It must be one of :-\n"; $sql = "select name from species"; - $sth = dbi()->prepare($sql); + $sth = $dbi->prepare($sql); $sth->execute(); while(my @row = $sth->fetchrow_array()){ print STDERR $row[0]."\n"; } - die("Please try again\n"); + die("Please try again :-)\n"); } + return $species_id; +} - # - # Dump out the sequences where the species has an id of species_id - # and the sequence type is 'dna' - # - $self->xref_dna_file($xref->dir."/".$xref->species."_xref_dna.fasta"); - open(XDNA,">".$xref->dir."/".$xref->species."_xref_dna.fasta") || die "Could not open xref_dna.fasta"; - my $sql = "select p.xref_id, p.sequence from primary_xref p, xref x "; - $sql .= "where p.xref_id = x.xref_id and "; - $sql .= " p.sequence_type ='dna' and "; - $sql .= " x.species_id = ".$species_id." "; - $sth = $xref->dbi()->prepare($sql); + +sub get_set_lists{ + my ($self) = @_; + + return [["method1",["homo_sapiens","RefSeq"],["homo_sapiens","UniProtSwissProt"]], + ["method2",[$self->species,"*"]], + ["method3",["*","*"]]]; +} + + +sub get_source_id_from_source_name{ + my ($xref, $source) = @_; + my $source_id; + + my $sql = "select source_id from source where name = '".$source."'"; + my $dbi = $xref->dbi(); + my $sth = $dbi->prepare($sql); + $sth->execute(); + my @row = $sth->fetchrow_array(); + if (defined $row[0] and $row[0] ne '') { + $source_id = $row[0]; +# print $source."\t*".$row[0]."*\n"; + } else { + print STDERR "Couldn't get ID for source ".$source."\n"; + print STDERR "It must be one of :-\n"; + $sql = "select name from source"; + $sth = $dbi->prepare($sql); + $sth->execute(); + while(my @row = $sth->fetchrow_array()){ + print STDERR $row[0]."\n"; + } + die("Please try again :-)\n"); + } + return $source_id; +} + +sub dump_xref{ + my ($self,$xref) = @_; + + if(!defined($xref->dir())){ + if(defined($self->dir)){ + $xref->species($self->dir); + } + else{ + $xref->dir("."); + } + } + + + my @method=(); + + my @lists =@{$self->get_set_lists()}; + + my $i=0; + foreach my $list (@lists){ + print "method->".@$list[0]."\n"; + $method[$i] = shift @$list; + my $j = 1; + my @source_id-(); + my @species_id=(); + foreach my $element (@$list){ + while(my $species = shift(@$element)){ + # print $j.")\t".$species."\n"; + if($species ne "*"){ + $species_id[$j] = get_species_id_from_species_name($xref,$species); + } + else{ + $species_id[$j] = -1; + } + my $source = shift(@$element); + if($source ne "*"){ + $source_id[$j] = get_source_id_from_source_name($xref,$source); + } + else{ + $source_id[$j] = -1; + } + print $j."\t".$source. "\t".$source_id[$j] ."\n"; + print $j."\t".$species."\t".$species_id[$j]."\n"; + $j++; + } + } + #method data fully defined now + dump_subset($xref,\@species_id,\@source_id,$i); + $i++; + } + + $self->method(\@method); + + return; + +} + + +sub dump_subset{ + my ($xref,$rspecies_id,$rsource_id,$index) = @_; + + open(XDNA,">".$xref->dir()."/xref_".$index."_dna.fasta") + || die "Could not open xref_".$index."_dna.fasta"; + + my $sql = "select p.xref_id, p.sequence, x.species_id , x.source_id "; + $sql .= " from primary_xref p, xref x "; + $sql .= " where p.xref_id = x.xref_id and "; + $sql .= " p.sequence_type ='dna' "; + + + for (my $j =1; $j<scalar(@$rspecies_id); $j++){ + print $j."\t".$$rspecies_id[$j]."\t".$$rsource_id[$j]."\n"; + } + # return $xref->dir."/xref_".$i."_dna.fasta"; + + my $sth = $xref->dbi()->prepare($sql); $sth->execute(); my $i = 0; while(my @row = $sth->fetchrow_array()){ - $i++; - $row[1] =~ s/(.{60})/$1\n/g; - print XDNA ">".$row[0]."\n".$row[1]."\n"; - if($i > 10){ - goto ENDDNA; + my $pass = 0; + for (my $j =1; $j<scalar(@$rspecies_id); $j++){ + if($$rspecies_id[$j] < 0 or $row[2] == $$rspecies_id[$j]){ + if($$rsource_id[$j] < 0 or $row[3] == $$rsource_id[$j]){ + $pass = 1; + } + } + } + if($pass){ + $i++; + $row[1] =~ s/(.{60})/$1\n/g; + print XDNA ">".$row[0]."\n".$row[1]."\n"; + if($i > 10){ + goto ENDDNA; + } } } -ENDDNA: + ENDDNA: close XDNA; - # - # Dump out the sequences where the species has an id of species_id - # and the sequence type is 'peptide' - # - $self->xref_protein_file($xref->dir."/".$self->species."_xref_prot.fasta"); - open(XPEP,">".$xref->dir."/".$self->species."_xref_prot.fasta") || die "Could not open xref_prot.fasta"; - $sql = "select p.xref_id, p.sequence from primary_xref p, xref x "; - $sql .= "where p.xref_id = x.xref_id and "; - $sql .= " p.sequence_type ='peptide' and "; - $sql .= " x.species_id = ".$species_id." "; + + open(XPRO,">".$xref->dir."/xref_".$index."_prot.fasta") + || die "Could not open xref_".$index."_prot.fasta"; + my $sql = "select p.xref_id, p.sequence, x.species_id , x.source_id "; + $sql .= " from primary_xref p, xref x "; + $sql .= " where p.xref_id = x.xref_id and "; + $sql .= " p.sequence_type ='peptide' "; + + $sth = $xref->dbi()->prepare($sql); $sth->execute(); $i = 0; while(my @row = $sth->fetchrow_array()){ - $i++; - $row[1] =~ s/(.{60})/$1\n/g; - print XPEP ">".$row[0]."\n".$row[1]."\n"; - if($i > 10){ - goto ENDXPEP; + my $pass = 0; + for (my $j =1; $j<scalar(@$rspecies_id); $j++){ + if($$rspecies_id[$j] < 0 or $row[2] == $$rspecies_id[$j]){ + if($$rsource_id[$j] < 0 or $row[3] == $$rsource_id[$j]){ + $pass = 1; + } + } + } + if($pass){ + $i++; + $row[1] =~ s/(.{60})/$1\n/g; + print XPRO ">".$row[0]."\n".$row[1]."\n"; + if($i > 10){ + goto ENDPRO; + } } } -ENDXPEP: - close XPEP; + ENDPRO: + close XPRO; } - sub dump_ensembl{ my ($self) = @_; @@ -177,14 +312,14 @@ sub fetch_and_dump_seq{ foreach my $gene_id (@gene_ids){ $i++; my $gene = $gene_adap->fetch_by_dbID($gene_id); - print "gene ".$gene."\n"; +# print "gene ".$gene."\n"; foreach my $transcript (@{$gene->get_all_Transcripts()}) { my $seq = $transcript->spliced_seq(); $seq =~ s/(.{60})/$1\n/g; print DNA ">" . $transcript->dbID() . "\n" .$seq."\n"; my $trans = $transcript->translation(); my $translation = $transcript->translate(); - print "tranlation ".$translation."\n"; +# print "tranlation ".$translation."\n"; my $pep_seq = $translation->seq(); $pep_seq =~ s/(.{60})/$1\n/g; print PEP ">".$trans->dbID()."\n".$pep_seq."\n"; @@ -236,6 +371,14 @@ sub ensembl_dna_file{ return $self->{_ens_dna_file}; } +sub method{ + my ($self, $arg) = @_; + + (defined $arg) && + ($self->{_method} = $arg ); + return $self->{_method}; +} + #sub get_ensembl_type{ # my %type; -- GitLab