From c70d4ffb697148bbfe9273fc05b513f4fd4a8dcf Mon Sep 17 00:00:00 2001 From: Glenn Proctor <gp1@sanger.ac.uk> Date: Mon, 11 Oct 2004 16:02:48 +0000 Subject: [PATCH] Removed - functionality now in BasicMapper.pm --- misc-scripts/xref_mapping/ensembl_dumper.pl | 38 -- .../xref_mapping/exonerate_prototype.pl | 399 ------------------ misc-scripts/xref_mapping/xref_dumper.pl | 34 -- 3 files changed, 471 deletions(-) delete mode 100644 misc-scripts/xref_mapping/ensembl_dumper.pl delete mode 100644 misc-scripts/xref_mapping/exonerate_prototype.pl delete mode 100644 misc-scripts/xref_mapping/xref_dumper.pl diff --git a/misc-scripts/xref_mapping/ensembl_dumper.pl b/misc-scripts/xref_mapping/ensembl_dumper.pl deleted file mode 100644 index 42098df35d..0000000000 --- a/misc-scripts/xref_mapping/ensembl_dumper.pl +++ /dev/null @@ -1,38 +0,0 @@ -# Dump Ensembl sequences to fasta file - -use strict; - -use Bio::EnsEMBL::DBSQL::DBAdaptor; - -my $host = 'ecs2'; -my $port = 3364; -my $user = 'ensro'; -my $dbname = 'homo_sapiens_core_25_34e'; - -my $file = "ensembl_transcripts.fasta"; - -my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $host, - -port => $port, - -user => $user, - -dbname => $dbname); - -my $slice_adaptor = $db->get_SliceAdaptor(); -my $slice = $slice_adaptor->fetch_by_region('chromosome', '21'); - -my $t = 0; - -open(FILE, ">" . $file); - -foreach my $gene (@{$slice->get_all_Genes()}) { - - foreach my $trans (@{$gene->get_all_Transcripts()}) { - - print FILE ">" . $trans->dbID() . "\n" . $trans->spliced_seq() . "\n"; - $t++; - - } -} - -close(FILE); - -print "Wrote $t transcripts to $file\n"; diff --git a/misc-scripts/xref_mapping/exonerate_prototype.pl b/misc-scripts/xref_mapping/exonerate_prototype.pl deleted file mode 100644 index f19c6ee78a..0000000000 --- a/misc-scripts/xref_mapping/exonerate_prototype.pl +++ /dev/null @@ -1,399 +0,0 @@ -use strict; - -use DBI; -use File::Basename; -use IPC::Open3; - -# Use exonerate (or other program) to find xref-ensembl obejct mappings - - -# XXX -my $queryfile = "xref_dna.fasta"; -my $targetfile = "ensembl_transcripts.fasta"; - -run_mapping($queryfile, $targetfile, "."); -store($targetfile); - -sub run_mapping { - - my ($queryfile, $targetfile, $root_dir) = @_; - - # get list of methods - my @methods = ("ExonerateBest1"); # TODO get from Ian, maybe files as well - - # foreach method, submit the appropriate job & keep track of the job name - my @job_names; - - foreach my $method (@methods) { - - my $obj_name = "XrefMapper::Methods::$method"; - # check that the appropriate object exists - eval "require $obj_name"; - if($@) { - - warn("Could not find object $obj_name corresponding to mapping method $method, skipping\n$@"); - - } else { - - my $obj = $obj_name->new(); - my $job_name = $obj->run($queryfile, $targetfile); - push @job_names, $job_name; - print "Submitted LSF job $job_name to list\n"; - sleep 1; # make sure unique names really are unique - - } - - } # foreach method - - # submit depend job to wait for all mapping jobs - submit_depend_job($root_dir, @job_names); - - -} # run_exonerate - - -sub submit_depend_job { - - my ($root_dir, @job_names) = @_; - - # Submit a job that does nothing but wait on the main jobs to - # finish. This job is submitted interactively so the exec does not - # return until everything is finished. 
- - # build up the bsub command; first part - my @depend_bsub = ('bsub', '-K'); - - # one -wended clause for each main job - foreach my $job (@job_names) { - push @depend_bsub, "-wended($job)"; - } - - # rest of command - push @depend_bsub, ('-q', 'small', '-o', "$root_dir/depend.out", '-e', "$root_dir/depend.err", '/bin/true'); - - #print "depend bsub:\n" . join (" ", @depend_bsub) . "\n"; - - my ($depend_wtr, $depend_rtr, $depend_etr, $depend_pid); - $depend_pid = open3($depend_wtr, $depend_rtr, $depend_etr, @depend_bsub); - my $depend_jobid; - while (<$depend_rtr>) { - if (/Job <([0-9]+)> is/) { - $depend_jobid = $1; - print "LSF job ID for depend job: $depend_jobid \n" ; - } - } - if (!defined($depend_jobid)) { - print STDERR "Error: could not get depend job ID\n"; - } - - - -} - -=head2 store - - Arg[1] : The target file used in the exonerate run. Used to work out the Ensembl object type. - Arg[2] : - Example : none - Description: Parse exonerate output files and build files for loading into target db tables. - Returntype : List of strings - Exceptions : none - Caller : general - -=cut - -sub store { - - my ($target_file_name) = @_; - - my $type = get_ensembl_object_type($target_file_name); - - # get or create the appropriate analysis ID - my $analysis_id = get_analysis_id($type); - - # TODO - get this from config - my $dbi = DBI->connect("dbi:mysql:host=ecs1g;port=3306;database=arne_core_20_34", - "ensadmin", - "ensembl", - {'RaiseError' => 1}) || die "Can't connect to database"; - - # get current max object_xref_id - my $max_object_xref_id = 0; - my $sth = $dbi->prepare("SELECT MAX(object_xref_id) FROM object_xref"); - $sth->execute(); - my $max_object_xref_id = ($sth->fetchrow_array())[0]; - if (!defined $max_object_xref_id) { - print "Can't get highest existing object_xref_id, using 1\n)"; - } else { - print "Maximum existing object_xref_id = $max_object_xref_id\n"; - } - - - #my $ox_sth = $dbi->prepare("INSERT INTO object_xref(ensembl_id, ensembl_object_type, xref_id) VALUES(?,?,?)"); - - #my $ix_sth = $dbi->prepare("INSERT INTO identity_xref VALUES(?,?,?,?,?,?,?,?,?,?,?)"); - - # files to write table data to - open (OBJECT_XREF, ">object_xref.txt"); - open (IDENTITY_XREF, ">identity_xref.txt"); - - my $total_lines = 0; - my $total_files = 0; - - my $object_xref_id = $max_object_xref_id + 1; - - # keep a (unique) list of xref IDs that need to be written out to file as well - my %primary_xref_ids; - - foreach my $file (glob("*.map")) { - - print "Parsing results from $file \n"; - open(FILE, $file); - $total_files++; - - while (<FILE>) { - - $total_lines++; - chomp(); - my ($label, $query_id, $target_id, $query_start, $query_end, $target_start, $target_end, $cigar_line, $score) = split(/:/, $_); - $cigar_line =~ s/ //; - - # TODO make sure query & target are the right way around - - print OBJECT_XREF "$object_xref_id\t$target_id\t$type\t$query_id\n"; - print IDENTITY_XREF "$object_xref_id\t$query_id\t$target_id\t$query_start\t$query_end\t$target_start\t$target_end\t$cigar_line\t$score\t\\N\t$analysis_id\n"; - # TODO - evalue? 
- $object_xref_id++; - - $primary_xref_ids{$query_id} = $query_id; - - # Store in database - # create entry in object_xref and get its object_xref_id - #$ox_sth->execute($target_id, $type, $query_id) || warn "Error writing to object_xref table"; - #my $object_xref_id = $ox_sth->{'mysql_insertid'}; - - # create entry in identity_xref - #$ix_sth->execute($object_xref_id, $query_id, $target_id, $query_start, $query_end, $target_start, $target_end, $cigar_line, $score, undef, $analysis_id) || warn "Error writing to identity_xref table"; - - } - - close(FILE); - - } - - close(IDENTITY_XREF); - close(OBJECT_XREF); - - print "Read $total_lines lines from $total_files exonerate output files\n"; - - # write relevant xrefs to file - dump_xrefs(\%primary_xref_ids); - -} - - -sub get_ensembl_object_type { - - my $filename = shift; - my $type; - - if ($filename =~ /gene/i) { - - $type = "Gene"; - - } elsif ($filename =~ /transcript/i) { - - $type = "Transcript"; - - } elsif ($filename =~ /translation/i) { - - $type = "Translation"; - - } else { - - print STDERR "Cannot deduce Ensembl object type from filename $filename"; - } - - return $type; - -} - - -sub get_analysis_id { - - my $ensembl_type = shift; - - my %typeToLogicName = ( 'transcript' => 'XrefExonerateDNA', - 'translation' => 'XrefExonerateProtein' ); - - my $logic_name = $typeToLogicName{lc($ensembl_type)}; - - # TODO - get these details from Config - my $host = "ecs1g"; - my $port = 3306; - my $database = "arne_core_20_34"; - my $user = "ensadmin"; - my $password = "ensembl"; - - my $dbi = DBI->connect("dbi:mysql:host=$host;port=$port;database=$database", - "$user", - "$password", - {'RaiseError' => 1}) || die "Can't connect to database"; - - - my $sth = $dbi->prepare("SELECT analysis_id FROM analysis WHERE logic_name='" . $logic_name ."'"); - $sth->execute(); - - my $analysis_id; - - if (my @row = $sth->fetchrow_array()) { - - $analysis_id = $row[0]; - print "Found exising analysis ID ($analysis_id) for $logic_name\n"; - - } else { - - print "No analysis with logic_name $logic_name found, creating ...\n"; - $sth = $dbi->prepare("INSERT INTO analysis (logic_name, created) VALUES ('" . $logic_name. "', NOW())"); - # TODO - other fields in analysis table - $sth->execute(); - $analysis_id = $sth->{'mysql_insertid'}; - print "Done (analysis ID=" . $analysis_id. 
")\n"; - - } - - return $analysis_id; - -} - - -sub dump_xrefs { - - my $xref_ids_hashref = shift; - my @xref_ids = keys %$xref_ids_hashref; - - open (XREF, ">xref.txt"); - - # TODO - get this from config - my $xref_dbi = DBI->connect("dbi:mysql:host=ecs1g;port=3306;database=glenn_test_xref", - "ensro", - "", - {'RaiseError' => 1}) || die "Can't connect to database"; - - my $core_dbi = DBI->connect("dbi:mysql:host=ecs1g;port=3306;database=arne_core_20_34", - "ensro", - "", - {'RaiseError' => 1}) || die "Can't connect to database"; - - # get current highest internal ID from xref - my $max_xref_id = 0; - my $core_sth = $core_dbi->prepare("SELECT MAX(xref_id) FROM xref"); - $core_sth->execute(); - my $max_xref_id = ($core_sth->fetchrow_array())[0]; - if (!defined $max_xref_id) { - print "Can't get highest existing xref_id, using 0\n)"; - } else { - print "Maximum existing xref_id = $max_xref_id\n"; - } - my $core_xref_id = $max_xref_id + 1; - - # keep a unique list of source IDs to build the external_db table later - my %source_ids; - - # execute several queries with a max of 200 entries in each IN clause - more efficient - my $batch_size = 200; - - while(@xref_ids) { - - my @ids; - if($#xref_ids > $batch_size) { - @ids = splice(@xref_ids, 0, $batch_size); - } else { - @ids = splice(@xref_ids, 0); - } - - my $id_str; - if(@ids > 1) { - $id_str = "IN (" . join(',', @ids). ")"; - } else { - $id_str = "= " . $ids[0]; - } - - - my $sql = "SELECT * FROM xref WHERE xref_id $id_str"; - my $xref_sth = $xref_dbi->prepare($sql); - $xref_sth->execute(); - - my ($xref_id, $accession, $label, $description, $source_id, $species_id); - $xref_sth->bind_columns(\$xref_id, \$accession, \$label, \$description, \$source_id, \$species_id); - - # note the xref_id we write to the file is NOT the one we've just read - # from the internal xref database as the ID may already exist in the core database - while (my @row = $xref_sth->fetchrow_array()) { - print XREF "$core_xref_id\t$accession\t$label\t$description\n"; - $source_ids{$source_id} = $source_id; - $core_xref_id++; - if ($source_id == 1001) { - print "xref $xref_id has source_id 1001\n"; - } - } - - # Now get the dependent xrefs for each of these xrefs and write them as well - $sql = "SELECT x.accession, x.label, x.description, x.source_id FROM dependent_xref dx, xref x WHERE x.xref_id=dx.master_xref_id AND master_xref_id $id_str"; - my $dep_sth = $xref_dbi->prepare($sql); - $dep_sth->execute(); - - $dep_sth->bind_columns(\$accession, \$label, \$description, \$source_id); - while (my @row = $dep_sth->fetchrow_array()) { - print XREF "$core_xref_id\t$accession\t$label\t$description\tDEPENDENT\n"; - $source_ids{$source_id} = $source_id; - $core_xref_id++; - } - #print "source_ids: " . join(" ", keys(%source_ids)) . 
"\n"; - - } # while @xref_ids - - close(XREF); - - # now write the exernal_db file - the %source_ids hash will contain the IDs of the - # sources that need to be written as external_dbs - open(EXTERNAL_DB, ">external_db.txt"); - - # get current highest internal ID from external_db - my $max_edb_id = 0; - my $core_sth = $core_dbi->prepare("SELECT MAX(external_db_id) FROM external_db"); - $core_sth->execute(); - my $max_edb_id = ($core_sth->fetchrow_array())[0]; - if (!defined $max_edb_id) { - print "Can't get highest existing external_db_id, using 0\n)"; - } else { - print "Maximum existing external_db_id = $max_edb_id\n"; - } - my $edb_id = $max_edb_id + 1; - - my @source_id_array = keys %source_ids; - my $source_id_str; - if(@source_id_array > 1) { - $source_id_str = "IN (" . join(',', @source_id_array). ")"; - } else { - $source_id_str = "= " . $source_id_array[0]; - } - - my $source_sql = "SELECT name, release FROM source WHERE source_id $source_id_str"; - my $source_sth = $xref_dbi->prepare($source_sql); - $source_sth->execute(); - - my ($source_name, $release); - $source_sth->bind_columns(\$source_name, \$release); - - while (my @row = $source_sth->fetchrow_array()) { - print EXTERNAL_DB "$edb_id\t$source_name\t$release\tXREF\n"; - # TODO knownxref etc?? - $edb_id++; - } - - close(EXTERNAL_DB); - - - -} diff --git a/misc-scripts/xref_mapping/xref_dumper.pl b/misc-scripts/xref_mapping/xref_dumper.pl deleted file mode 100644 index af6c7cd63d..0000000000 --- a/misc-scripts/xref_mapping/xref_dumper.pl +++ /dev/null @@ -1,34 +0,0 @@ -# Dump primary xrefs to FASTA file - -use strict; -use DBI; - -my $file = "xref_dna.fasta"; - -my $host = "ecs1g"; -my $port = 3306; -my $database = "glenn_test_xref"; -my $user = "ensadmin"; -my $password = "ensembl"; - -my $dbi = DBI->connect("dbi:mysql:host=$host;port=$port;database=$database", - "$user", - "$password", - {'RaiseError' => 1}) || die "Can't connect to database"; - -open(FILE, ">" . $file); - -my $sth = $dbi->prepare("SELECT x.xref_id, px.sequence FROM primary_xref px, source so, xref x, species sp WHERE sp.name='homo_sapiens' AND so.name='RefSeq' AND x.species_id=sp.species_id AND px.source_id=so.source_id AND x.xref_id=px.xref_id AND px.sequence_type='dna' LIMIT 500"); -$sth->execute(); - -my ($xref_id, $sequence); -$sth->bind_columns(\$xref_id, \$sequence); - -while (my @row = $sth->fetchrow_array()) { - - print FILE ">$xref_id\n$sequence\n"; - -} - -close(FILE); - -- GitLab