Skip to content
Snippets Groups Projects
Commit 3841d1bd authored by Laura Clarke's avatar Laura Clarke
Browse files

decided to put this stuff in pipeline
parent 69ee1ecc
No related branches found
No related tags found
No related merge requests found
#!/usr/local/ensembl/bin/perl -w
# mouse agp -> ensembl sgp
$| = 1;
#print STDERR "have entered script\n";
my $agp = shift or die;
my $raw = shift or die;
my $chromosome = shift or die;
my @sgp;
my %chr_id;
my %ctg;
open RAW, "< $raw" or die "couldn't open file ".$raw." @!";
open AGP, "< $agp" or die;
open CHR, "< $chromosome" or die;
while (<RAW>) {
chomp;
#print;
#print "\n";
my ($contig_id, $iid, $length) = split;
#print "have ".$contig_id." ".$iid." ".$length."\n";
#AL031185.2.1.4059
#my ($id) = $contig_id =~ /(\w+\.\d+)\.\d+\.\d+/; # strip off leading 'sc<date>_'
$ctg{$contig_id} = [ $iid, $length ];
}
while(<CHR>){
chomp;
my ($id, $name) = split;
#print "have ".$id." name ".$name."\n";
$chr_id{$name} = $id;
}
while (<AGP>) {
chomp;
#I 47490 107680 3 F AC024796.1 1 60191 +
#print;
#print "\n";
my ($chr, $chr_start, $chr_end, $gap, $contig, $raw_start, $raw_end, $raw_ori) =
(split)[0, 1, 2, 4, 5, 6, 7, 8];
#next if $raw_start eq 'fragment';
#next if $raw_start eq 'clone';
#next if $raw_start eq 'contig';
if($gap eq 'N'){
next;
}
if($contig eq '.'){
next;
}
#$chr =~ s/chr//;
#next if $chr eq 'NA_contam';
#print $chr." start ".$chr_start." end ".$chr_end." contig ".$contig." start ".$raw_start." end ".$raw_end." ori ".$raw_ori."\n";
if ($raw_ori eq '+') {
$raw_ori = 1;
}
elsif ($raw_ori eq '-') {
$raw_ori = -1;
}
else {
$raw_ori = 1;
#print "$chr Contig $contig $chr_start \n";
#print "Warning assumed default orientation for $contig\n";
}
my $chromosome_id = $chr_id{$chr};
my $ctg_id = $ctg{$contig}->[0];
die "Don't have raw contig for $contig\n" unless $ctg_id;
#print $contig."\n" if(!$ctg_id);
#my $ctg_start = $ctg{$contig}->[1];
my $ctg_end = $ctg{$contig}->[1];
if($raw_end > $ctg_end){
next;
#warn "contig ".$contig." length ".$ctg_end." is less than agp length ".$raw_end." won't work\n";
}
#if ($raw_start != $ctg_start || $raw_start != $ctg_start) {
# raw coords from AGP assumed to be from 1 to length of contig
# print "Warning: my assumption about contig coords for $contig is wrong\n";
# }
# $sgp[0] = qq{};
$sgp[0] = $chromosome_id; # fpc = contig id
$sgp[1] = $chr_start;
$sgp[2] = $chr_end;
$sgp[3] = $chr;
$sgp[4] = $chr_start;
$sgp[5] = $chr_end; # fpc start
$sgp[6] = 1; # fpc end
$sgp[7] = $ctg_id; # fpc ori
$sgp[8] = $raw_start; # raw start
$sgp[9] = $raw_end; # raw end
$sgp[10] = $raw_ori;
$sgp[11] = qq{elegans_90};
print join("\t", @sgp), "\n";
}
close RAW;
close AGP;
close CHR;
#!/usr/local/bin/perl -w
use strict;
use vars qw($USER $PASS $DB $HOST $DSN);
use Bio::EnsEMBL::DBLoader;
use Bio::SeqIO;
use Bio::EnsEMBL::RawContig;
use Bio::EnsEMBL::Clone;
use Getopt::Long;
my $dbtype = 'rdb';
my $host = 'ecs1d';
my $port = '';
my $dbname = 'elegans_newschema';
my $dbuser = 'ecs1dadmin';
my $dbpass = 'TyhRv';
my $module = 'Bio::EnsEMBL::DBSQL::DBAdaptor';
$| = 1;
my $help = 0;
my $pipe = 1;
my $verbose = 0;
&GetOptions(
'dbtype:s' => \$dbtype,
'host:s' => \$host,
'port:n' => \$port,
'dbname:s' => \$dbname,
'dbuser:s' => \$dbuser,
'dbpass:s' => \$dbpass,
'module:s' => \$module,
'pipe' => \$pipe,
'verbose' => \$verbose,
'h|help' => \$help
);
if ($help) {
exec('perldoc', $0);
}
$SIG{INT} = sub {my $sig=shift;die "exited after SIG$sig";};
my $locator;
if ($dbpass) {
$locator = "$module/host=$host;port=;dbname=$dbname;user=$dbuser;pass=$dbpass";
} else {
$locator = "$module/host=$host;port=;dbname=$dbname;user=$dbuser;pass=";
}
my $db = Bio::EnsEMBL::DBLoader->new($locator);
my ($seqfile) = shift;
my ($contig_id_file) = shift;
open(FH, $contig_id_file);
my %seq_got;
if( !defined $seqfile ) { die 'cannot load because sequence file to load sequences from was not passed in as argument';}
my $seqio = new Bio::SeqIO(-format => 'Fasta',
-file => $seqfile);
my $count = 0;
while( my $seq = $seqio->next_seq ) {
my @clone_ids = split / /, $seq->desc;
#print $seq->id."\n";
#print $seq->desc."\n";
if($seq->seq =~ /match/i){
die("seq = ".$seq->seq."\n");
}
if($seq->seq =~ /abort/i){
die("seq = ".$seq->seq."\n");
}
my $clone = new Bio::EnsEMBL::Clone;
my $contig = new Bio::EnsEMBL::RawContig;
my ($version) = $clone_ids[0] =~ /\S+\.(\d+)/;
$seq_got{$clone_ids[0]} = 1;
$clone->htg_phase(4);
$clone->id($clone_ids[0]);
$clone->embl_id($clone_ids[0]);
$clone->version(1);
$clone->embl_version($version);
$clone->adaptor($db->get_CloneAdaptor);
my $contig_id = $clone_ids[0].".1.".$seq->length;
$contig->name($contig_id);
$contig->dbID($count++);
$contig->seq($seq->seq);
$contig->length($seq->length);
$contig->adaptor($db->get_RawContigAdaptor);
$clone->add_Contig($contig);
my $time = time;
$clone->created($time);
$clone->modified($time);
#print "have clone with ".$clone->id." contig ".$contig->id."\n";
eval {
$db->get_CloneAdaptor->store($clone);
$verbose && print STDERR "Written ".$clone->id." scaffold into db\n";
};
if( $@ ) {
print STDERR "Could not write clone into database, error was $@\n";
}
}
while(<FH>){
chomp;
my $name = $_;
if(!$seq_got{$name}){
print "don't seem to have sequence for ".$name."\n";
}
}
close(FH);
#!/usr/local/ensembl/bin/perl -w
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
$| = 1;
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => "ecs1d",
-user => "ecs1dadmin",
-dbname => "elegans_newschema",
-pass => "TyhRv",
);
my %chromosome;
my $sql = "select name, length from chromosome";
my $sth = $db->prepare($sql);
$sth->execute;
my $slice_adaptor = $db->get_SliceAdaptor();
while(my ($name, $length) = $sth->fetchrow){
my $slice = $slice_adaptor->fetch_by_chr_start_end($name, 1, $length);
$chromosome{$name} = $slice;
}
foreach my $name(keys(%chromosome)){
my @genes;
my $slice = $chromosome{$name};
#$sql = 'select gene_id from gene';
#$sth = $db->prepare($sql);
#$sth->execute;
#while(my ($gene_id) = $sth->fetchrow){
# my $gene = $db->get_GeneAdaptor->fetch_by_dbID($gene_id);
# push(@genes, $gene);
#}
@genes = @{$slice->get_all_Genes};
#print STDERR "there are ".@genes." genes\n";
my $sticky_count = 0;
my $non_translate = 0;
foreach my $gene(@genes){
#$sticky_count += &sticky_check($gene);
$non_translate += &translation_check($gene);
#my ($transcript) = @{$gene->get_all_Transcripts};
#if($translate){
# my @exons = @{$transcript->get_all_Exons};
#&display_exons(@exons);
#&non_translate($transcript);
#}
}
#print "there are ".$sticky_count." transcripts containing sticky_exons\n";
print "there are ".$non_translate." non translating transcripts in ".$name."\n";
}
sub sticky_check{
my ($gene) = @_;
my $count = 0;
my $sticky = 0;
my @transcripts = @{$gene->get_all_Transcripts};
foreach my $t(@transcripts){
foreach my $e(@{$t->get_all_Exons}){
if($e->isa("Bio::EnsEMBL::StickyExon")){
$sticky = 1;
}
}
if($sticky){
$count++;
}
$sticky = 0;
}
return $count;
}
sub translation_check{
my ($gene) = @_;
my $count = 0;
my @transcripts = @{$gene->get_all_Transcripts};
foreach my $t(@transcripts){
my $pep = $t->translate->seq;
if($pep =~ /\*/){
print STDERR "\ntranscript ".$t->stable_id." doesn't translate: \n\n";
$count++;
}
}
return $count;
}
sub display_exons{
my (@exons) = @_;
@exons = sort{$a->start <=> $b->start || $a->end <=> $b->end} @exons;
if($exons[0]->can ('hseqname')){
foreach my $e(@exons){
print $e->hseqname."\t ".$e->start."\t ".$e->end."\t ".$e->strand."\t ".$e->phase."\t ".$e->end_phase."\n";
}
}else{
foreach my $e(@exons){
print $e->seqname."\t ".$e->start."\t ".$e->end."\t ".$e->strand."\t ".$e->phase."\t ".$e->end_phase."\n";
}
}
}
sub non_translate{
my (@transcripts) = @_;
foreach my $t(@transcripts){
my @exons = @{$t->get_all_Exons};
# print "transcript sequence :\n".$t->seq."\n";
foreach my $e(@exons){
my $seq = $e->seq;
my $pep0 = $seq->translate('*', 'X', 0);
my $pep1 = $seq->translate('*', 'X', 1);
my $pep2 = $seq->translate('*', 'X', 2);
print "exon sequence :\n".$e->seq->seq."\n\n";
print $e->seqname." ".$e->start." : ".$e->end." translation in 0 frame\n ".$pep0->seq."\n\n";
print $e->seqname." ".$e->start." : ".$e->end." translation in 1 phase\n ".$pep2->seq."\n\n";
print $e->seqname." ".$e->start." : ".$e->end." translation in 2 phase\n ".$pep1->seq."\n\n";
print "\n\n";
}
}
}
#!/usr/local/ensembl/bin/perl -w
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::Translation;
use Bio::EnsEMBL::Exon;
$| = 1;
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => "ecs1d",
-user => "ecs1dadmin",
-dbname => "elegans_newschema",
-pass => "TyhRv",
);
#CHROMOSOME_I curated CDS 11641 11689 . + 0 Sequence "Y74C9A.2"
#CHROMOSOME_I curated CDS 14951 15160 . + 2 Sequence "Y74C9A.2"
#CHROMOSOME_I curated CDS 16473 16585 . + 2 Sequence "Y74C9A.2"
#CHROMOSOME_I curated CDS 43733 43961 . + 0 Sequence "Y74C9A.1"
#CHROMOSOME_I curated CDS 44030 44234 . + 2 Sequence "Y74C9A.1"
#CHROMOSOME_I curated CDS 44281 44328 . + 1 Sequence "Y74C9A.1"
#CHROMOSOME_I curated CDS 44521 44677 . + 1 Sequence "Y74C9A.1"
#CHROMOSOME_I curated CDS 49921 50016 . + 0 Sequence "Y48G1C.4"
#CHROMOSOME_I curated CDS 50815 51030 . + 0 Sequence "Y48G1C.4"
#CHROMOSOME_I curated CDS 52283 52410 . + 0 Sequence "Y48G1C.4"
#CHROMOSOME_I curated CDS 52466 52572 . + 1 Sequence "Y48G1C.4"
my %chromosome;
my $sql = "select name, length from chromosome";
my $sth = $db->prepare($sql);
$sth->execute;
my $slice_adaptor = $db->get_SliceAdaptor();
while(my ($name, $length) = $sth->fetchrow){
my $slice = $slice_adaptor->fetch_by_chr_start_end($name, 1, $length);
$chromosome{$name} = $slice;
}
my $analysis_adaptor = $db->get_AnalysisAdaptor();
my $analysis = $analysis_adaptor->fetch_by_logic_name('wormbase');
my $forward_count = 0;
my $reverse_count = 0;
my $count = 0;
my %genes;
my @stored;
my $file = shift;
open(FH, $file) or die "couldn't open $file $!";
my $transcripts = &process_file(\*FH);
my $processed_transcripts = &process_transcripts($transcripts, $analysis);
#print "there are ".keys(%$processed_transcripts)." transcript\n";
my $genes = &create_transcripts($processed_transcripts);
#print "there are ".keys(%$genes)." genes\n";
my $gene_adaptor = $db->get_GeneAdaptor;
my @genes;
foreach my $gene_id(keys(%$genes)){
my $db_id;
my $transcripts = $genes->{$gene_id};
my $unpruned = &create_gene($transcripts, $gene_id);
## print STDERR "gene ".$unpruned."\n";
my $gene = &prune_Exons($unpruned);
push(@genes, $gene);
#eval{
# $gene->transform;
#};
foreach my $transcript (@{$gene->get_all_Transcripts}){
print "transcript ".$transcript->stable_id."\n";
my @exons = @{$transcript->get_all_Exons};
if($exons[0]->strand == -1){
@exons = sort{$b->start <=> $a->start} @exons;
}else{
@exons = sort{$a->start <=> $b->start} @exons;
}
foreach my $exon(@exons){
print "exon ".$exon->stable_id."\t".$exon->seqname."\t ".$exon->start."\t ".$exon->end."\t ".$exon->strand."\t ".$exon->phase."\t ".$exon->end_phase."\n";
}
}
# if($@){
# warn "couldn't transform ".$gene." $@\n";
# next;
# }
# eval{
# #$gene_adaptor->store($gene);
# };
# if($@){
# warn "couldn't store ".$gene."\n";
# next;
# }
# $db_id = $gene->dbID;
# print STDERR "have stored gene ".$gene->dbID."\n";
# #my $stored_gene = $gene_adaptor->fetch_by_dbID($db_id);
# #&translation_check($stored_gene);
}
print "there are ".@genes." genes in wormbase 90\n";
sub process_file{
my ($fh) = @_;
my %transcripts;
LOOP: while(<$fh>){
# print;
chomp;
my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split;
my $element = $_;
if(!$status && !$type){
#print "status and type no defined skipping\n";
next LOOP;
}
my $line = $status." ".$type;
# print "line ".$line."\n";
if($line ne 'curated CDS'){
next LOOP;
}
$gene =~ s/\"//g;
if(!$transcripts{$gene}){
$transcripts{$gene} = [];
push(@{$transcripts{$gene}}, $element);
}else{
push(@{$transcripts{$gene}}, $element);
}
}
return \%transcripts;
}
sub process_transcripts{
my ($transcripts, $analysis) = @_;
my %genes;
my %transcripts = %$transcripts;
my @names = keys(%transcripts);
#print STDERR "PROCESSING TRANSCRIPTS \n";
foreach my $name(@names){
my @lines = @{$transcripts{$name}};
$transcripts{$name} = [];
my @exons;
foreach my $line(@lines){
my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split /\s+/, $line;
$chr =~ s/CHROMOSOME_//;
if($start == $end){
next;
}
my $slice = $chromosome{$chr};
my $exon = new Bio::EnsEMBL::Exon;
my $phase = (3 - $frame)%3;
#print "have ".$phase." phase\n";
$exon->start($start);
$exon->end($end);
$exon->analysis($analysis);
$exon->contig($slice);
$exon->phase($phase);
my $end_phase = ($phase + ($exon->end-$exon->start) + 1)%3;
$exon->end_phase($end_phase);
if($strand eq '+'){
$exon->strand(1);
}else{
$exon->strand(-1);
}
$exon->score(100);
push(@exons, $exon);
}
if($exons[0]->strand == -1){
@exons = sort{$b->start <=> $a->start} @exons;
}else{
@exons = sort{$a->start <=> $b->start} @exons;
}
#print STDERR $name." transcript\n";
my $phase = 0;
foreach my $e(@exons){
#print "phase ".$phase."\n";
#$e->phase($phase);
#my $end_phase = ($phase + ($e->end-$e->start) + 1)%3;
#$e->end_phase($end_phase);
#print "end phase ".$end_phase."\n";
#$phase = $end_phase;
push(@{$transcripts{$name}}, $e);
}
}
#print STDERR "FINISHED PROCESSING TRANSCRIPTS\n";
return \%transcripts;
}
sub create_transcripts{
my ($transcripts) = @_;
#print STDERR "CREATING TRANSCRIPTS\n";
my %transcripts = %$transcripts;
my @non_translate;
my %genes;
my $gene_name;
my $transcript_id;
foreach my $transcript(keys(%transcripts)){
my $time = time;
#print STDERR "transcript ".$transcript." \n";
my @exons = @{$transcripts{$transcript}};
if($transcript =~ /\w+\.\d+\w+/){
($gene_name) = $transcript =~ /(\w+\.\d+)\w+/;
$transcript_id = $transcript;
}else{
$gene_name = $transcript;
$transcript_id = $transcript;
}
my $transcript = new Bio::EnsEMBL::Transcript;
my $translation = new Bio::EnsEMBL::Translation;
my @sorted_exons;
if($exons[0]->strand == 1){
@sorted_exons = sort{$a->start <=> $b->start} @exons
}else{
@sorted_exons = sort{$b->start <=> $a->start} @exons
}
my $exon_count = 1;
my $phase = 0;
foreach my $exon(@sorted_exons){
$exon->created($time);
$exon->modified($time);
$exon->version(1);
$exon->stable_id($transcript_id.".".$exon_count);
#$exon->phase($phase);
#my $end_phase = ($phase + ($end-$start) + 1)%3;
#$exon->end_phase($end_phase);
#$phase = $end_phase;
$exon_count++;
$transcript->add_Exon($exon);
}
$translation->start_Exon($sorted_exons[0]);
$translation->end_Exon ($sorted_exons[$#sorted_exons]);
if ($sorted_exons[0]->phase == 0) {
$translation->start(1);
} elsif ($sorted_exons[0]->phase == 1) {
$translation->start(3);
} elsif ($sorted_exons[0]->phase == 2) {
$translation->start(2);
}
$translation->end ($sorted_exons[$#sorted_exons]->end - $sorted_exons[$#sorted_exons]->start + 1);
# $translation->version(1);
$translation->stable_id($transcript_id);
$transcript->translation($translation);
$transcript->version(1);
$transcript->stable_id($transcript_id);
# my $mrna = $transcript->translateable_seq;
# my $last_codon = substr($mrna, -3);
# if($last_codon eq 'TAG' || $last_codon eq 'TGA' || $last_codon eq 'TAA'){
# my $translation_end = ($sorted_exons[$#sorted_exons]->end - $sorted_exons[$#sorted_exons]->start + 1)-3;
# if($translation_end <= 0){
# my $exon_number = $#sorted_exons
# }
# $translation->end (($sorted_exons[$#sorted_exons]->end - $sorted_exons[$#sorted_exons]->start + 1)-3);
# }
my $peptide = $transcript->translate->seq;
if($peptide =~ /\*./){
# warn "transcript ".$transcript->stable_id." doesn't translate and won't be used\n";
# push(@non_translate, $transcript);
# next;
}
#print STDERR "transcript ".$transcript->stable_id." has ".@{$transcript->get_all_Exons}." exons and produces and translation: ".$peptide."\n";
my @stored_exons = @{$transcript->get_all_Exons};
#&display_exons(@stored_exons);
if(!$genes{$gene_name}){
$genes{$gene_name} = [];
push(@{$genes{$gene_name}}, $transcript);
}else{
push(@{$genes{$gene_name}}, $transcript);
}
}
foreach my $t(@non_translate){
print STDERR "transcript ".$t->stable_id." has ".@{$t->get_all_Exons}." exons\n";
my @exons = @{$t->get_all_Exons};
&display_exons(@exons);
&non_translate($t);
}
#print STDERR "FINISHED CREATEING TRANSCRIPTS\n";
return \%genes;
}
sub create_gene{
my ($transcripts, $name) = @_;
#print STDERR "CREATING GENE\n";
my $time = time;
my $gene = new Bio::EnsEMBL::Gene;
my $exons = $transcripts->[0]->get_all_Exons;
my $analysis = $exons->[0]->analysis;
$gene->analysis($analysis);
$gene->type($analysis->logic_name);
$gene->created($time);
$gene->modified($time);
$gene->version(1);
$gene->stable_id($name);
foreach my $transcript(@$transcripts){
$gene->add_Transcript($transcript);
}
return $gene;
}
sub prune_Exons {
my ($gene) = @_;
my @unique_Exons;
# keep track of all unique exons found so far to avoid making duplicates
# need to be very careful about translation->start_Exon and translation->end_Exon
foreach my $tran (@{$gene->get_all_Transcripts}) {
my @newexons;
foreach my $exon (@{$tran->get_all_Exons}) {
my $found;
#always empty
UNI:foreach my $uni (@unique_Exons) {
if ($uni->start == $exon->start &&
$uni->end == $exon->end &&
$uni->strand == $exon->strand &&
$uni->phase == $exon->phase &&
$uni->end_phase == $exon->end_phase
) {
$found = $uni;
last UNI;
}
}
if (defined($found)) {
push(@newexons,$found);
if ($exon == $tran->translation->start_Exon){
$tran->translation->start_Exon($found);
}
if ($exon == $tran->translation->end_Exon){
$tran->translation->end_Exon($found);
}
} else {
push(@newexons,$exon);
push(@unique_Exons, $exon);
}
}
$tran->flush_Exon;
foreach my $exon (@newexons) {
$tran->add_Exon($exon);
}
}
return $gene;
}
sub translation_check{
my ($gene) = @_;
my @transcripts = @{$gene->get_all_Transcripts};
foreach my $t(@transcripts){
my $pep = $t->translate->seq;
if($pep =~ /\*/){
print STDERR "transcript ".$t->stable_id." doesn't translate\n";
}
}
}
sub display_exons{
my (@exons) = @_;
@exons = sort{$a->start <=> $b->start || $a->end <=> $b->end} @exons;
if($exons[0]->can ('hseqname')){
foreach my $e(@exons){
print $e->hseqname."\t ".$e->start."\t ".$e->end."\t ".$e->strand."\t ".$e->phase."\t ".$e->end_phase."\n";
}
}else{
foreach my $e(@exons){
print $e->seqname."\t ".$e->start."\t ".$e->end."\t ".$e->strand."\t ".$e->phase."\t ".$e->end_phase."\n";
}
}
}
sub non_translate{
my (@transcripts) = @_;
foreach my $t(@transcripts){
my @exons = @{$t->get_all_Exons};
# print "transcript sequence :\n".$t->seq."\n";
foreach my $e(@exons){
my $seq = $e->seq;
my $pep0 = $seq->translate('*', 'X', 0);
my $pep1 = $seq->translate('*', 'X', 1);
my $pep2 = $seq->translate('*', 'X', 2);
print "exon sequence :\n".$e->seq->seq."\n\n";
print $e->seqname." ".$e->start." : ".$e->end." translation in 0 frame\n ".$pep0->seq."\n\n";
print $e->seqname." ".$e->start." : ".$e->end." translation in 1 phase\n ".$pep2->seq."\n\n";
print $e->seqname." ".$e->start." : ".$e->end." translation in 2 phase\n ".$pep1->seq."\n\n";
print "\n\n";
}
}
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment