Skip to content
Snippets Groups Projects
Commit 2bf1df17 authored by Ian Longden's avatar Ian Longden
Browse files

xref sequence dumps are now configurable. See get_set_lists method

parent 6ca670c0
No related branches found
No related tags found
No related merge requests found
......@@ -33,36 +33,51 @@ Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk
sub dump_seqs{
my ($self, $xref) = @_;
$self->dump_ensembl();
$self->dump_xref($xref);
$self->dump_ensembl();
}
sub run_matching{
my ($self,$xref) = @_;
my @list=();
my $i = 0;
foreach my $method (@{$self->method()}){
my @dna=();
push @dna, $method;
push @dna, $xref->dir."/xref_".$i."_dna.fasta";
push @dna, $self->ensembl_dna_file();
push @list, \@dna;
my @pep=();
push @pep, $method;
push @pep, $xref->dir."/xref_".$i."_prot.fasta";
push @pep, $self->ensembl_protein_file();
push @list, \@pep;
$i++;
}
glenn_call(\@list);
print "NOT done yet:-)\n";
}
sub glenn_call{
my ($lists) = @_;
foreach my $list (@$lists){
my ($meth, $q ,$t) = @$list;
print "HELLO: ".$meth."\n\t".$q."\n\t".$t."\n\n";
}
}
sub store{
print "NOT done yet Either :-)\n";
}
sub dump_xref{
my ($self,$xref) = @_;
sub get_species_id_from_species_name{
my ($xref,$species) = @_;
if(!defined($xref->species())){
$xref->species($self->species);
}
if(!defined($xref->dir())){
if(defined($self->species)){
$xref->species($self->species)
}
else{
$xref->species(".");
}
}
#
# the species specified must be in the database and hence have a species_id
#
my $sql = "select species_id from species where name = '".$xref->species."'";
my $sql = "select species_id from species where name = '".$species."'";
my $dbi = $xref->dbi();
my $sth = $dbi->prepare($sql);
$sth->execute();
......@@ -71,68 +86,188 @@ sub dump_xref{
if (defined @row) {
$species_id = $row[0];
} else {
print STDERR "Couldn't get ID for species ".$xref->species."\n";
print STDERR "Couldn't get ID for species ".$species."\n";
print STDERR "It must be one of :-\n";
$sql = "select name from species";
$sth = dbi()->prepare($sql);
$sth = $dbi->prepare($sql);
$sth->execute();
while(my @row = $sth->fetchrow_array()){
print STDERR $row[0]."\n";
}
die("Please try again\n");
die("Please try again :-)\n");
}
return $species_id;
}
#
# Dump out the sequences where the species has an id of species_id
# and the sequence type is 'dna'
#
$self->xref_dna_file($xref->dir."/".$xref->species."_xref_dna.fasta");
open(XDNA,">".$xref->dir."/".$xref->species."_xref_dna.fasta") || die "Could not open xref_dna.fasta";
my $sql = "select p.xref_id, p.sequence from primary_xref p, xref x ";
$sql .= "where p.xref_id = x.xref_id and ";
$sql .= " p.sequence_type ='dna' and ";
$sql .= " x.species_id = ".$species_id." ";
$sth = $xref->dbi()->prepare($sql);
sub get_set_lists{
my ($self) = @_;
return [["method1",["homo_sapiens","RefSeq"],["homo_sapiens","UniProtSwissProt"]],
["method2",[$self->species,"*"]],
["method3",["*","*"]]];
}
sub get_source_id_from_source_name{
my ($xref, $source) = @_;
my $source_id;
my $sql = "select source_id from source where name = '".$source."'";
my $dbi = $xref->dbi();
my $sth = $dbi->prepare($sql);
$sth->execute();
my @row = $sth->fetchrow_array();
if (defined $row[0] and $row[0] ne '') {
$source_id = $row[0];
# print $source."\t*".$row[0]."*\n";
} else {
print STDERR "Couldn't get ID for source ".$source."\n";
print STDERR "It must be one of :-\n";
$sql = "select name from source";
$sth = $dbi->prepare($sql);
$sth->execute();
while(my @row = $sth->fetchrow_array()){
print STDERR $row[0]."\n";
}
die("Please try again :-)\n");
}
return $source_id;
}
sub dump_xref{
my ($self,$xref) = @_;
if(!defined($xref->dir())){
if(defined($self->dir)){
$xref->species($self->dir);
}
else{
$xref->dir(".");
}
}
my @method=();
my @lists =@{$self->get_set_lists()};
my $i=0;
foreach my $list (@lists){
print "method->".@$list[0]."\n";
$method[$i] = shift @$list;
my $j = 1;
my @source_id-();
my @species_id=();
foreach my $element (@$list){
while(my $species = shift(@$element)){
# print $j.")\t".$species."\n";
if($species ne "*"){
$species_id[$j] = get_species_id_from_species_name($xref,$species);
}
else{
$species_id[$j] = -1;
}
my $source = shift(@$element);
if($source ne "*"){
$source_id[$j] = get_source_id_from_source_name($xref,$source);
}
else{
$source_id[$j] = -1;
}
print $j."\t".$source. "\t".$source_id[$j] ."\n";
print $j."\t".$species."\t".$species_id[$j]."\n";
$j++;
}
}
#method data fully defined now
dump_subset($xref,\@species_id,\@source_id,$i);
$i++;
}
$self->method(\@method);
return;
}
sub dump_subset{
my ($xref,$rspecies_id,$rsource_id,$index) = @_;
open(XDNA,">".$xref->dir()."/xref_".$index."_dna.fasta")
|| die "Could not open xref_".$index."_dna.fasta";
my $sql = "select p.xref_id, p.sequence, x.species_id , x.source_id ";
$sql .= " from primary_xref p, xref x ";
$sql .= " where p.xref_id = x.xref_id and ";
$sql .= " p.sequence_type ='dna' ";
for (my $j =1; $j<scalar(@$rspecies_id); $j++){
print $j."\t".$$rspecies_id[$j]."\t".$$rsource_id[$j]."\n";
}
# return $xref->dir."/xref_".$i."_dna.fasta";
my $sth = $xref->dbi()->prepare($sql);
$sth->execute();
my $i = 0;
while(my @row = $sth->fetchrow_array()){
$i++;
$row[1] =~ s/(.{60})/$1\n/g;
print XDNA ">".$row[0]."\n".$row[1]."\n";
if($i > 10){
goto ENDDNA;
my $pass = 0;
for (my $j =1; $j<scalar(@$rspecies_id); $j++){
if($$rspecies_id[$j] < 0 or $row[2] == $$rspecies_id[$j]){
if($$rsource_id[$j] < 0 or $row[3] == $$rsource_id[$j]){
$pass = 1;
}
}
}
if($pass){
$i++;
$row[1] =~ s/(.{60})/$1\n/g;
print XDNA ">".$row[0]."\n".$row[1]."\n";
if($i > 10){
goto ENDDNA;
}
}
}
ENDDNA:
ENDDNA:
close XDNA;
#
# Dump out the sequences where the species has an id of species_id
# and the sequence type is 'peptide'
#
$self->xref_protein_file($xref->dir."/".$self->species."_xref_prot.fasta");
open(XPEP,">".$xref->dir."/".$self->species."_xref_prot.fasta") || die "Could not open xref_prot.fasta";
$sql = "select p.xref_id, p.sequence from primary_xref p, xref x ";
$sql .= "where p.xref_id = x.xref_id and ";
$sql .= " p.sequence_type ='peptide' and ";
$sql .= " x.species_id = ".$species_id." ";
open(XPRO,">".$xref->dir."/xref_".$index."_prot.fasta")
|| die "Could not open xref_".$index."_prot.fasta";
my $sql = "select p.xref_id, p.sequence, x.species_id , x.source_id ";
$sql .= " from primary_xref p, xref x ";
$sql .= " where p.xref_id = x.xref_id and ";
$sql .= " p.sequence_type ='peptide' ";
$sth = $xref->dbi()->prepare($sql);
$sth->execute();
$i = 0;
while(my @row = $sth->fetchrow_array()){
$i++;
$row[1] =~ s/(.{60})/$1\n/g;
print XPEP ">".$row[0]."\n".$row[1]."\n";
if($i > 10){
goto ENDXPEP;
my $pass = 0;
for (my $j =1; $j<scalar(@$rspecies_id); $j++){
if($$rspecies_id[$j] < 0 or $row[2] == $$rspecies_id[$j]){
if($$rsource_id[$j] < 0 or $row[3] == $$rsource_id[$j]){
$pass = 1;
}
}
}
if($pass){
$i++;
$row[1] =~ s/(.{60})/$1\n/g;
print XPRO ">".$row[0]."\n".$row[1]."\n";
if($i > 10){
goto ENDPRO;
}
}
}
ENDXPEP:
close XPEP;
ENDPRO:
close XPRO;
}
sub dump_ensembl{
my ($self) = @_;
......@@ -177,14 +312,14 @@ sub fetch_and_dump_seq{
foreach my $gene_id (@gene_ids){
$i++;
my $gene = $gene_adap->fetch_by_dbID($gene_id);
print "gene ".$gene."\n";
# print "gene ".$gene."\n";
foreach my $transcript (@{$gene->get_all_Transcripts()}) {
my $seq = $transcript->spliced_seq();
$seq =~ s/(.{60})/$1\n/g;
print DNA ">" . $transcript->dbID() . "\n" .$seq."\n";
my $trans = $transcript->translation();
my $translation = $transcript->translate();
print "tranlation ".$translation."\n";
# print "tranlation ".$translation."\n";
my $pep_seq = $translation->seq();
$pep_seq =~ s/(.{60})/$1\n/g;
print PEP ">".$trans->dbID()."\n".$pep_seq."\n";
......@@ -236,6 +371,14 @@ sub ensembl_dna_file{
return $self->{_ens_dna_file};
}
sub method{
my ($self, $arg) = @_;
(defined $arg) &&
($self->{_method} = $arg );
return $self->{_method};
}
#sub get_ensembl_type{
# my %type;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment