# load_dna.pl - load FASTA sequences into an EnsEMBL-style database,
# splitting every sequence into contigs of roughly $CHUNK_SIZE bases.
#
# Provenance: misc-scripts/anopheles_scripts/load_dna.pl, introduced by commit
# 3a3dd37c9a391f47cde0faf17e7c1264458737ea
# (Emmanuel Mongin <mongin@sanger.ac.uk>, Thu, 2 Jan 2003 11:43:47 +0000,
# "Added a loading script for Ewan").
#
# Usage:
#     perl load_dna.pl <sequences.fasta>
#
# For every FASTA record the script inserts
#   * one row into `chromosome` (id, name = FASTA id, length),
#   * one row into `clone`,
#   * and, for every contig cut from the sequence, one row each into
#     `dna`, `contig` and `assembly`.
#
# chromosome_id / clone_id are the 1-based index of the record in the file;
# contig_id is a counter that runs across all records.
#
# NOTE(review): contig.dna_id is written as the contig counter, i.e. the
# script assumes the id the database assigns to each new `dna` row equals that
# counter. That presumably only holds when loading into empty tables - confirm
# before running against a populated database.

use strict;
use warnings;

use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::SeqIO;
use Bio::EnsEMBL::DBSQL::ProteinAdaptor;

use Bio::EnsEMBL::Utils::Eprof('eprof_start','eprof_end','eprof_dump');

# Connection settings and the assembly type written to assembly.type.
my $host   = 'ecs1b';
my $dbuser = 'ensadmin';
my $dbname = 'drosophila_melanogaster_9_3';
my $dbpass = 'ensembl';
my $path   = 'FLYBASE';

# Target contig size in bases; a sequence is cut into int(length / size)
# contigs (at least one).
my $CHUNK_SIZE = 25000;

my ($dna) = @ARGV;
die "Usage: $0 <fasta_file>\n" unless defined $dna;

print STDERR "Connecting to $host, $dbname\n";

my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
    '-host'   => $host,
    '-user'   => $dbuser,
    '-dbname' => $dbname,
    '-pass'   => $dbpass,
);

# Prepare every statement once instead of once per contig.
my $chr_sth = $db->prepare(
    'insert into chromosome (chromosome_id,name,length) values (?,?,?)'
);
my $clone_sth = $db->prepare(
    'insert into clone (clone_id,name, embl_acc, version, embl_version, htg_phase, created, modified) values(?, ?, ?, ?, ?, ?,NOW(), NOW())'
);
my $dna_sth = $db->prepare(
    'insert into dna(sequence,created) values(?, NOW())'
);
my $contig_sth = $db->prepare(
    'insert into contig(name, contig_id, dna_id, length, clone_id, embl_offset) values(?, ?, ?, ?, ?, ?)'
);
my $assembly_sth = $db->prepare(
    'insert into assembly (chromosome_id,chr_start,chr_end,superctg_name,superctg_start,superctg_end,superctg_ori,contig_id,contig_start,contig_end,contig_ori,type) values (?,?,?,?,?,?,?,?,?,?,?,?)'
);

my $in = Bio::SeqIO->new(-file => $dna, '-format' => 'Fasta');

my $count = 1;    # next contig_id, shared by all sequences in the file
my $total = 0;    # number of sequences loaded so far (= chromosome/clone id)

SEQUENCE:
while (my $seq = $in->next_seq) {
    my $sequence = $seq->seq;
    my $length   = defined $sequence ? length $sequence : 0;
    my $ac       = $seq->id;

    # An empty record cannot be cut into contigs; skip it rather than crash.
    if ($length == 0) {
        warn "Skipping empty sequence '$ac'\n";
        next SEQUENCE;
    }

    $total++;
    my $clone  = $total;
    my $chr_id = $clone;

    # Load chromosome table
    run_insert($chr_sth, 'chromosome', $chr_id, $ac, $length);

    # Load clone table
    # NOTE(review): embl_acc receives the literal string 'NULL', not an SQL
    # NULL - kept as in the original; confirm that this is intended.
    run_insert($clone_sth, 'clone', $clone, $ac, 'NULL', 1, 0, 3);

    # Number of contigs and nominal contig length. Sequences shorter than
    # $CHUNK_SIZE become a single contig (the unguarded division used to die
    # with "Illegal division by zero" for them).
    my $div = int($length / $CHUNK_SIZE);
    $div = 1 if $div < 1;
    my $chunk_len = int($length / $div);

    print STDERR "AC: $ac\tDIV: $div\tL: $chunk_len\n";

    my $total_length = 0;    # bases covered by the contigs of this sequence
    my $start        = 1;    # 1-based start of the next contig

    for my $chunk (1 .. $div) {

        # Boundaries are those of the original script: the first contig is
        # $chunk_len bases long, inner contigs are $chunk_len + 1 bases long,
        # and the last contig takes whatever remains.
        my $end;
        if ($chunk == $div) {
            $end = $length;
        }
        elsif ($chunk == 1) {
            $end = $chunk_len;
        }
        else {
            $end = $start + $chunk_len;
        }

        if ($start > $end or $end > $length) {
            die "Contig $chunk of $ac would span $start-$end, "
              . "outside the sequence (length $length)\n";
        }

        my $subseq  = $seq->subseq($start, $end);
        my $subseql = length $subseq;
        $total_length += $subseql;

        # NOTE(review): the first contig of every sequence is named "<id>_1"
        # while later ones use the global contig counter - kept unchanged.
        my $actmp = $chunk == 1 ? $ac . "_1" : $ac . "_" . $count;

        print STDERR "SUB: $subseql\tL: $chunk_len\n";

        # Load DNA table
        run_insert($dna_sth, 'dna', $subseq);

        # Load contig table
        # NOTE(review): embl_offset is always 1, as in the original.
        run_insert(
            $contig_sth, 'contig',
            $actmp, $count, $count, $subseql, $clone, 1,
        );

        # Load the assembly table: the contig maps 1:1, forward strand, onto
        # chromosome and super-contig coordinates $start..$end.
        run_insert(
            $assembly_sth, 'assembly',
            $chr_id, $start, $end,
            "FPC_" . $ac, $start, $end, 1,
            $count, 1, $subseql, 1,
            $path,
        );

        $count++;
        $start = $end + 1;
    }

    # Sanity check: the contigs must cover the sequence exactly. The original
    # wrote "$total_length =! $l" (an assignment of !$l), which never fired.
    if ($total_length != $length) {
        print STDERR "TOTAL: $total_length\nLENGTH: $length\n";
        die "Contigs of $ac cover $total_length bases, expected $length\n";
    }
}

# Execute a prepared insert with the given bind values and die with the
# driver's error message if the statement fails.
sub run_insert {
    my ($sth, $table, @bind) = @_;

    $sth->execute(@bind)
        or die "Insert into $table failed: " . $sth->errstr . "\n";

    return;
}