Skip to content
Snippets Groups Projects
Commit 2bf1df17 authored by Ian Longden's avatar Ian Longden
Browse files

xref sequence dumps are now configurable. See get_set_lists method

parent 6ca670c0
No related branches found
No related tags found
No related merge requests found
...@@ -33,36 +33,51 @@ Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk ...@@ -33,36 +33,51 @@ Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk
sub dump_seqs{ sub dump_seqs{
my ($self, $xref) = @_; my ($self, $xref) = @_;
$self->dump_ensembl();
$self->dump_xref($xref); $self->dump_xref($xref);
$self->dump_ensembl();
} }
sub run_matching{ sub run_matching{
my ($self,$xref) = @_;
my @list=();
my $i = 0;
foreach my $method (@{$self->method()}){
my @dna=();
push @dna, $method;
push @dna, $xref->dir."/xref_".$i."_dna.fasta";
push @dna, $self->ensembl_dna_file();
push @list, \@dna;
my @pep=();
push @pep, $method;
push @pep, $xref->dir."/xref_".$i."_prot.fasta";
push @pep, $self->ensembl_protein_file();
push @list, \@pep;
$i++;
}
glenn_call(\@list);
print "NOT done yet:-)\n"; print "NOT done yet:-)\n";
} }
sub glenn_call{
my ($lists) = @_;
foreach my $list (@$lists){
my ($meth, $q ,$t) = @$list;
print "HELLO: ".$meth."\n\t".$q."\n\t".$t."\n\n";
}
}
sub store{ sub store{
print "NOT done yet Either :-)\n"; print "NOT done yet Either :-)\n";
} }
sub dump_xref{ sub get_species_id_from_species_name{
my ($self,$xref) = @_; my ($xref,$species) = @_;
if(!defined($xref->species())){ my $sql = "select species_id from species where name = '".$species."'";
$xref->species($self->species);
}
if(!defined($xref->dir())){
if(defined($self->species)){
$xref->species($self->species)
}
else{
$xref->species(".");
}
}
#
# the species specified must be in the database and hence have a species_id
#
my $sql = "select species_id from species where name = '".$xref->species."'";
my $dbi = $xref->dbi(); my $dbi = $xref->dbi();
my $sth = $dbi->prepare($sql); my $sth = $dbi->prepare($sql);
$sth->execute(); $sth->execute();
...@@ -71,68 +86,188 @@ sub dump_xref{ ...@@ -71,68 +86,188 @@ sub dump_xref{
if (defined @row) { if (defined @row) {
$species_id = $row[0]; $species_id = $row[0];
} else { } else {
print STDERR "Couldn't get ID for species ".$xref->species."\n"; print STDERR "Couldn't get ID for species ".$species."\n";
print STDERR "It must be one of :-\n"; print STDERR "It must be one of :-\n";
$sql = "select name from species"; $sql = "select name from species";
$sth = dbi()->prepare($sql); $sth = $dbi->prepare($sql);
$sth->execute(); $sth->execute();
while(my @row = $sth->fetchrow_array()){ while(my @row = $sth->fetchrow_array()){
print STDERR $row[0]."\n"; print STDERR $row[0]."\n";
} }
die("Please try again\n"); die("Please try again :-)\n");
} }
return $species_id;
}
#
# Dump out the sequences where the species has an id of species_id sub get_set_lists{
# and the sequence type is 'dna' my ($self) = @_;
#
$self->xref_dna_file($xref->dir."/".$xref->species."_xref_dna.fasta"); return [["method1",["homo_sapiens","RefSeq"],["homo_sapiens","UniProtSwissProt"]],
open(XDNA,">".$xref->dir."/".$xref->species."_xref_dna.fasta") || die "Could not open xref_dna.fasta"; ["method2",[$self->species,"*"]],
my $sql = "select p.xref_id, p.sequence from primary_xref p, xref x "; ["method3",["*","*"]]];
$sql .= "where p.xref_id = x.xref_id and "; }
$sql .= " p.sequence_type ='dna' and ";
$sql .= " x.species_id = ".$species_id." ";
$sth = $xref->dbi()->prepare($sql); sub get_source_id_from_source_name{
my ($xref, $source) = @_;
my $source_id;
my $sql = "select source_id from source where name = '".$source."'";
my $dbi = $xref->dbi();
my $sth = $dbi->prepare($sql);
$sth->execute();
my @row = $sth->fetchrow_array();
if (defined $row[0] and $row[0] ne '') {
$source_id = $row[0];
# print $source."\t*".$row[0]."*\n";
} else {
print STDERR "Couldn't get ID for source ".$source."\n";
print STDERR "It must be one of :-\n";
$sql = "select name from source";
$sth = $dbi->prepare($sql);
$sth->execute();
while(my @row = $sth->fetchrow_array()){
print STDERR $row[0]."\n";
}
die("Please try again :-)\n");
}
return $source_id;
}
sub dump_xref{
my ($self,$xref) = @_;
if(!defined($xref->dir())){
if(defined($self->dir)){
$xref->species($self->dir);
}
else{
$xref->dir(".");
}
}
my @method=();
my @lists =@{$self->get_set_lists()};
my $i=0;
foreach my $list (@lists){
print "method->".@$list[0]."\n";
$method[$i] = shift @$list;
my $j = 1;
my @source_id-();
my @species_id=();
foreach my $element (@$list){
while(my $species = shift(@$element)){
# print $j.")\t".$species."\n";
if($species ne "*"){
$species_id[$j] = get_species_id_from_species_name($xref,$species);
}
else{
$species_id[$j] = -1;
}
my $source = shift(@$element);
if($source ne "*"){
$source_id[$j] = get_source_id_from_source_name($xref,$source);
}
else{
$source_id[$j] = -1;
}
print $j."\t".$source. "\t".$source_id[$j] ."\n";
print $j."\t".$species."\t".$species_id[$j]."\n";
$j++;
}
}
#method data fully defined now
dump_subset($xref,\@species_id,\@source_id,$i);
$i++;
}
$self->method(\@method);
return;
}
sub dump_subset{
my ($xref,$rspecies_id,$rsource_id,$index) = @_;
open(XDNA,">".$xref->dir()."/xref_".$index."_dna.fasta")
|| die "Could not open xref_".$index."_dna.fasta";
my $sql = "select p.xref_id, p.sequence, x.species_id , x.source_id ";
$sql .= " from primary_xref p, xref x ";
$sql .= " where p.xref_id = x.xref_id and ";
$sql .= " p.sequence_type ='dna' ";
for (my $j =1; $j<scalar(@$rspecies_id); $j++){
print $j."\t".$$rspecies_id[$j]."\t".$$rsource_id[$j]."\n";
}
# return $xref->dir."/xref_".$i."_dna.fasta";
my $sth = $xref->dbi()->prepare($sql);
$sth->execute(); $sth->execute();
my $i = 0; my $i = 0;
while(my @row = $sth->fetchrow_array()){ while(my @row = $sth->fetchrow_array()){
$i++; my $pass = 0;
$row[1] =~ s/(.{60})/$1\n/g; for (my $j =1; $j<scalar(@$rspecies_id); $j++){
print XDNA ">".$row[0]."\n".$row[1]."\n"; if($$rspecies_id[$j] < 0 or $row[2] == $$rspecies_id[$j]){
if($i > 10){ if($$rsource_id[$j] < 0 or $row[3] == $$rsource_id[$j]){
goto ENDDNA; $pass = 1;
}
}
}
if($pass){
$i++;
$row[1] =~ s/(.{60})/$1\n/g;
print XDNA ">".$row[0]."\n".$row[1]."\n";
if($i > 10){
goto ENDDNA;
}
} }
} }
ENDDNA: ENDDNA:
close XDNA; close XDNA;
#
# Dump out the sequences where the species has an id of species_id open(XPRO,">".$xref->dir."/xref_".$index."_prot.fasta")
# and the sequence type is 'peptide' || die "Could not open xref_".$index."_prot.fasta";
# my $sql = "select p.xref_id, p.sequence, x.species_id , x.source_id ";
$self->xref_protein_file($xref->dir."/".$self->species."_xref_prot.fasta"); $sql .= " from primary_xref p, xref x ";
open(XPEP,">".$xref->dir."/".$self->species."_xref_prot.fasta") || die "Could not open xref_prot.fasta"; $sql .= " where p.xref_id = x.xref_id and ";
$sql = "select p.xref_id, p.sequence from primary_xref p, xref x "; $sql .= " p.sequence_type ='peptide' ";
$sql .= "where p.xref_id = x.xref_id and ";
$sql .= " p.sequence_type ='peptide' and ";
$sql .= " x.species_id = ".$species_id." ";
$sth = $xref->dbi()->prepare($sql); $sth = $xref->dbi()->prepare($sql);
$sth->execute(); $sth->execute();
$i = 0; $i = 0;
while(my @row = $sth->fetchrow_array()){ while(my @row = $sth->fetchrow_array()){
$i++; my $pass = 0;
$row[1] =~ s/(.{60})/$1\n/g; for (my $j =1; $j<scalar(@$rspecies_id); $j++){
print XPEP ">".$row[0]."\n".$row[1]."\n"; if($$rspecies_id[$j] < 0 or $row[2] == $$rspecies_id[$j]){
if($i > 10){ if($$rsource_id[$j] < 0 or $row[3] == $$rsource_id[$j]){
goto ENDXPEP; $pass = 1;
}
}
}
if($pass){
$i++;
$row[1] =~ s/(.{60})/$1\n/g;
print XPRO ">".$row[0]."\n".$row[1]."\n";
if($i > 10){
goto ENDPRO;
}
} }
} }
ENDXPEP: ENDPRO:
close XPEP; close XPRO;
} }
sub dump_ensembl{ sub dump_ensembl{
my ($self) = @_; my ($self) = @_;
...@@ -177,14 +312,14 @@ sub fetch_and_dump_seq{ ...@@ -177,14 +312,14 @@ sub fetch_and_dump_seq{
foreach my $gene_id (@gene_ids){ foreach my $gene_id (@gene_ids){
$i++; $i++;
my $gene = $gene_adap->fetch_by_dbID($gene_id); my $gene = $gene_adap->fetch_by_dbID($gene_id);
print "gene ".$gene."\n"; # print "gene ".$gene."\n";
foreach my $transcript (@{$gene->get_all_Transcripts()}) { foreach my $transcript (@{$gene->get_all_Transcripts()}) {
my $seq = $transcript->spliced_seq(); my $seq = $transcript->spliced_seq();
$seq =~ s/(.{60})/$1\n/g; $seq =~ s/(.{60})/$1\n/g;
print DNA ">" . $transcript->dbID() . "\n" .$seq."\n"; print DNA ">" . $transcript->dbID() . "\n" .$seq."\n";
my $trans = $transcript->translation(); my $trans = $transcript->translation();
my $translation = $transcript->translate(); my $translation = $transcript->translate();
print "tranlation ".$translation."\n"; # print "tranlation ".$translation."\n";
my $pep_seq = $translation->seq(); my $pep_seq = $translation->seq();
$pep_seq =~ s/(.{60})/$1\n/g; $pep_seq =~ s/(.{60})/$1\n/g;
print PEP ">".$trans->dbID()."\n".$pep_seq."\n"; print PEP ">".$trans->dbID()."\n".$pep_seq."\n";
...@@ -236,6 +371,14 @@ sub ensembl_dna_file{ ...@@ -236,6 +371,14 @@ sub ensembl_dna_file{
return $self->{_ens_dna_file}; return $self->{_ens_dna_file};
} }
sub method{
my ($self, $arg) = @_;
(defined $arg) &&
($self->{_method} = $arg );
return $self->{_method};
}
#sub get_ensembl_type{ #sub get_ensembl_type{
# my %type; # my %type;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment