Commit 67ec4cd2 authored by cvs2git's avatar cvs2git
Browse files

This commit was manufactured by cvs2svn to create tag 'pre-merge-eg-misc-scripts'.

Sprout from master 2012-05-10 09:07:55 UTC Nick Langridge <nickl@ebi.ac.uk> 'added seq region name'
Delete:
    docs/pipelines/fasta.html
    docs/pipelines/fasta.markdown
    misc-scripts/protein_match/process_pmach.pl
    modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm
    modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm
    modules/Bio/EnsEMBL/Pipeline/FASTA/ChecksumGenerator.pm
    modules/Bio/EnsEMBL/Pipeline/FASTA/ConcatFiles.pm
    modules/Bio/EnsEMBL/Pipeline/FASTA/CopyDNA.pm
    modules/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm
    modules/Bio/EnsEMBL/Pipeline/FASTA/FindDirs.pm
    modules/Bio/EnsEMBL/Pipeline/FASTA/Indexer.pm
    modules/Bio/EnsEMBL/Pipeline/FASTA/ReuseSpeciesFactory.pm
    modules/Bio/EnsEMBL/Pipeline/FASTA/SCPBlast.pm
    modules/Bio/EnsEMBL/Pipeline/FASTA/SpeciesFactory.pm
    modules/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm
    modules/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm
parent 9d5de234
<h1 id="fasta_pipeline">FASTA Pipeline</h1>
<p>This is a re-implementation of an existing pipeline developed originally by
core and the webteam. The new version uses eHive, so familiarity with this
system is essential, and has been written to use as little memory as possible.</p>
<h2 id="the_registry_file">The Registry File</h2>
<p>This is the way we retrieve the database connections to work with. The
registry file should specify:</p>
<ul>
<li>The core (and any other) databases to dump from</li>
<li>A production database
<ul>
<li><strong>species = multi</strong></li>
<li><strong>group = production</strong></li>
<li>Used to find which species require new DNA</li>
</ul></li>
<li>A web database
<ul>
<li><strong>species = multi</strong></li>
<li><strong>group = web</strong></li>
<li>Used to name BLAT index files</li>
</ul></li>
</ul>
<p>Here is an example of a file for v67 of Ensembl. Note the use of the
Registry object within a registry file and the scoping of the package. If
you omit the <em>-db_version</em> parameter and only use HEAD checkouts of Ensembl
then this will automatically select the latest version of the API. Any
change to version here must be reflected in the configuration file.</p>
<pre><code>package Reg;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
Bio::EnsEMBL::Registry-&gt;no_version_check(1);
Bio::EnsEMBL::Registry-&gt;no_cache_warnings(1);
{
my $version = 67;
Bio::EnsEMBL::Registry-&gt;load_registry_from_multiple_dbs(
{
-host =&gt; "mydb-1",
-port =&gt; 3306,
-db_version =&gt; $version,
-user =&gt; "user",
-NO_CACHE =&gt; 1,
},
{
-host =&gt; "mydb-2",
-port =&gt; 3306,
-db_version =&gt; $version,
-user =&gt; "user",
-NO_CACHE =&gt; 1,
},
);
Bio::EnsEMBL::DBSQL::DBAdaptor-&gt;new(
-HOST =&gt; 'mydb-2',
-PORT =&gt; 3306,
-USER =&gt; 'user',
-DBNAME =&gt; 'ensembl_website',
-SPECIES =&gt; 'multi',
-GROUP =&gt; 'web'
);
Bio::EnsEMBL::DBSQL::DBAdaptor-&gt;new(
-HOST =&gt; 'mydb-2',
-PORT =&gt; 3306,
-USER =&gt; 'user',
-DBNAME =&gt; 'ensembl_production',
-SPECIES =&gt; 'multi',
-GROUP =&gt; 'production'
);
}
1;
</code></pre>
<p>You give the registry to the <strong>init_pipeline.pl</strong> script via the <strong>-registry</strong> option</p>
<h2 id="overriding_defaults_using_a_new_config_file">Overriding Defaults Using a New Config File</h2>
<p>We recommend if you have a number of parameters which do not change
between releases to create a configuration file which inherits from the
root config file e.g.</p>
<pre><code>package MyCnf;
use base qw/Bio::EnsEMBL::Pipeline::FASTA::FASTA_conf/;
sub default_options {
my ($self) = @_;
return {
%{ $self-&gt;SUPER::default_options() },
#Override of options
};
}
1;
</code></pre>
<p>If you do override the config then you should use the package name for your overridden config in the upcoming example commands.</p>
<h2 id="environment">Environment</h2>
<h3 id="perl5lib">PERL5LIB</h3>
<ul>
<li>ensembl</li>
<li>ensembl-hive</li>
<li>bioperl</li>
</ul>
<h3 id="path">PATH</h3>
<ul>
<li>ensembl-hive/scripts</li>
<li>faToTwoBit (if not using a custom location)</li>
<li>xdformat (if not using a custom location)</li>
</ul>
<h3 id="ensembl_cvs_root_dir">ENSEMBL_CVS_ROOT_DIR</h3>
<p>Set to the base checkout of Ensembl. We should be able to add <em>ensembl-hive/sql</em> onto this path to find the SQL directory for hive e.g.</p>
<pre><code>export ENSEMBL_CVS_ROOT_DIR=$HOME/work/ensembl-checkouts
</code></pre>
<h3 id="ensadmin_psw">ENSADMIN_PSW</h3>
<p>Give the password to use to log into a database server e.g.</p>
<pre><code>export ENSADMIN_PSW=wibble
</code></pre>
<h2 id="example_commands">Example Commands</h2>
<h3 id="to_load_use_normally">To load use normally:</h3>
<pre><code>init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -base_path /path/to/dumps -registry reg.pm
</code></pre>
<h3 id="run_a_subset_of_species_no_forcing_supports_registry_aliases">Run a subset of species (no forcing &amp; supports registry aliases):</h3>
<pre><code>init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -species anolis -species celegans -species human \
-base_path /path/to/dumps -registry reg.pm
</code></pre>
<h3 id="specifying_species_to_force_supports_all_registry_aliases">Specifying species to force (supports all registry aliases):</h3>
<pre><code>init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -force_species anolis -force_species celegans -force_species human \
-base_path /path/to/dumps -registry reg.pm
</code></pre>
<h3 id="running_forcing_a_species">Running &amp; forcing a species:</h3>
<pre><code>init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -species celegans -force_species celegans \
-base_path /path/to/dumps -registry reg.pm
</code></pre>
<h3 id="dumping_just_gene_data_no_dna_or_ncrna">Dumping just gene data (no DNA or ncRNA):</h3>
<pre><code>init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -dump_type cdna \
-base_path /path/to/dumps -registry reg.pm
</code></pre>
<h3 id="using_a_different_scp_user_identity">Using a different SCP user &amp; identity:</h3>
<pre><code>init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -scp_user anotherusr -scp_identity /users/anotherusr/.pri/identity \
-base_path /path/to/dumps -registry reg.pm
</code></pre>
<h2 id="running_the_pipeline">Running the Pipeline</h2>
<ol>
<li>Start a screen session or get ready to run the beekeeper with a <strong>nohup</strong></li>
<li>Choose a dump location
<ul>
<li>A fasta, blast and blat directory will be created 1 level below</li>
</ul></li>
<li>Use an <em>init_pipeline.pl</em> configuration from above
<ul>
<li>Make sure to give it the <strong>-base_path</strong> parameter</li>
</ul></li>
<li>Sync the database using one of the commands displayed by <em>init_pipeline.pl</em></li>
<li><p>Run the pipeline in a loop with a good sleep between submissions and redirect log output (the following assumes you are using <strong>bash</strong>)</p>
<ul>
<li><strong>2>&amp;1</strong> is important as this clobbers STDERR into STDOUT</li>
<li><strong>&gt; my_run.log</strong> then sends the output to this file. Use <strong>tail -f</strong> to track the pipeline
beekeeper.pl -url mysql://usr:pass@server:port/db -reg_conf reg.pm -loop -sleep 5 2>&amp;1 > my_run.log &amp;</li>
</ul></li>
<li><p>Wait</p></li>
</ol>
# FASTA Pipeline
This is a re-implementation of an existing pipeline developed originally by
core and the webteam. The new version uses eHive, so familiarity with this
system is essential, and has been written to use as little memory as possible.
## The Registry File
This is the way we retrieve the database connections to work with. The
registry file should specify:
* The core (and any other) databases to dump from
* A production database
* **species = multi**
* **group = production**
* Used to find which species require new DNA
* A web database
* **species = multi**
* **group = web**
* Used to name BLAT index files
Here is an example of a file for v67 of Ensembl. Note the use of the
Registry object within a registry file and the scoping of the package. If
you omit the *-db_version* parameter and only use HEAD checkouts of Ensembl
then this will automatically select the latest version of the API. Any
change to version here must be reflected in the configuration file.
package Reg;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
Bio::EnsEMBL::Registry->no_version_check(1);
Bio::EnsEMBL::Registry->no_cache_warnings(1);
{
my $version = 67;
Bio::EnsEMBL::Registry->load_registry_from_multiple_dbs(
{
-host => "mydb-1",
-port => 3306,
-db_version => $version,
-user => "user",
-NO_CACHE => 1,
},
{
-host => "mydb-2",
-port => 3306,
-db_version => $version,
-user => "user",
-NO_CACHE => 1,
},
);
Bio::EnsEMBL::DBSQL::DBAdaptor->new(
-HOST => 'mydb-2',
-PORT => 3306,
-USER => 'user',
-DBNAME => 'ensembl_website',
-SPECIES => 'multi',
-GROUP => 'web'
);
Bio::EnsEMBL::DBSQL::DBAdaptor->new(
-HOST => 'mydb-2',
-PORT => 3306,
-USER => 'user',
-DBNAME => 'ensembl_production',
-SPECIES => 'multi',
-GROUP => 'production'
);
}
1;
You give the registry to the **init_pipeline.pl** script via the **-registry** option
## Overriding Defaults Using a New Config File
We recommend if you have a number of parameters which do not change
between releases to create a configuration file which inherits from the
root config file e.g.
package MyCnf;
use base qw/Bio::EnsEMBL::Pipeline::FASTA::FASTA_conf/;
sub default_options {
my ($self) = @_;
return {
%{ $self->SUPER::default_options() },
#Override of options
};
}
1;
If you do override the config then you should use the package name for your overridden config in the upcoming example commands.
## Environment
### PERL5LIB
* ensembl
* ensembl-hive
* bioperl
### PATH
* ensembl-hive/scripts
* faToTwoBit (if not using a custom location)
* xdformat (if not using a custom location)
### ENSEMBL\_CVS\_ROOT\_DIR
Set to the base checkout of Ensembl. We should be able to add *ensembl-hive/sql* onto this path to find the SQL directory for hive e.g.
export ENSEMBL_CVS_ROOT_DIR=$HOME/work/ensembl-checkouts
### ENSADMIN\_PSW
Give the password to use to log into a database server e.g.
export ENSADMIN_PSW=wibble
## Example Commands
### To load use normally:
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -base_path /path/to/dumps -registry reg.pm
### Run a subset of species (no forcing & supports registry aliases):
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -species anolis -species celegans -species human \
-base_path /path/to/dumps -registry reg.pm
### Specifying species to force (supports all registry aliases):
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -force_species anolis -force_species celegans -force_species human \
-base_path /path/to/dumps -registry reg.pm
### Running & forcing a species:
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -species celegans -force_species celegans \
-base_path /path/to/dumps -registry reg.pm
### Dumping just gene data (no DNA or ncRNA):
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -dump_type cdna \
-base_path /path/to/dumps -registry reg.pm
### Using a different SCP user & identity:
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -scp_user anotherusr -scp_identity /users/anotherusr/.pri/identity \
-base_path /path/to/dumps -registry reg.pm
## Running the Pipeline
1. Start a screen session or get ready to run the beekeeper with a **nohup**
2. Choose a dump location
* A fasta, blast and blat directory will be created 1 level below
3. Use an *init_pipeline.pl* configuration from above
* Make sure to give it the **-base_path** parameter
4. Sync the database using one of the commands displayed by *init_pipeline.pl*
5. Run the pipeline in a loop with a good sleep between submissions and redirect log output (the following assumes you are using **bash**)
* **2>&1** is important as this clobbers STDERR into STDOUT
* **> my_run.log** then sends the output to this file. Use **tail -f** to track the pipeline
beekeeper.pl -url mysql://usr:pass@server:port/db -reg_conf reg.pm -loop -sleep 5 2>&1 > my_run.log &
6. Wait
use strict;
use warnings;

=head1 Process pmatch

=head1 Description

=head2 Aims

This script aims to run pmatch and postprocess pmatch to map Ensembl peptides to external databases (currently Swissprot and Refseq but may be extended). The first part of the script runs pmatch, the second part gets the percentage of a match of a unique Ensembl peptide which matches to a unique external protein. The third part classifies each ensembl match as PRIMARY match (the longest one and the one which will be used for the mapping), PSEUDO, DUPLICATE and REPEAT (pretty arbitrary criteria but may be useful for quality control).
NB: All of the intermediary files are written.

=head2 Options

-ens   : Ensembl peptide fasta file
-sp    : SP, SPTREMBL fasta file
-refseq: Refseq peptide fasta file
-pdb   : PDB (SCOP) peptide fasta file

=head2 Contacts

mongin@ebi.ac.uk
birney@ebi.ac.uk

=cut

use Getopt::Long;

# Input fasta file paths, filled in from the command line; used as
# file-scoped globals by the subroutines below.
my ($ens, $sp, $refseq, $pdb);

GetOptions(
    'ens:s'    => \$ens,
    'sp:s'     => \$sp,
    'refseq:s' => \$refseq,
    'pdb:s'    => \$pdb,
);

# Stage 1: run pmatch against each external database (raw output on disk).
runpmatch();

# Stage 2: collapse raw pmatch hits into cumulative percent-identity
# records per (Ensembl peptide, external protein) pair.
# NOTE: the unresolved CVS merge conflict that used to sit here has been
# resolved in favour of revision 1.8, keeping the PDB call: the
# finalprocess($pdb) call below reads the ens_pdb.processed file that
# postprocesspmatch($pdb) produces.
postprocesspmatch($sp);
postprocesspmatch($refseq);
postprocesspmatch($pdb);

# Stage 3: label each match as PRIMARY/DUPLICATE/REPEAT/PSEUDO.
finalprocess($sp);
finalprocess($refseq);
finalprocess($pdb);

#perl ../../../src/ensembl-live/misc-scripts/protein_match/process_pmach.pl -ens ../primary/SPAN_pepfile -sp ../primary/SPTr.human.expanded -refseq ../primary/hs2.fsa -pdb ../primary/scop_human.fas
# Run the external pmatch binary (hard-coded NFS path) to align the
# Ensembl peptide fasta ($ens) against the SwissProt ($sp) and RefSeq
# ($refseq) fasta files.  Raw output is written to ens_sp_rawpmatch and
# ens_refseq_rawpmatch and kept on disk for debugging.
# NOTE(review): the PDB run ($pmatch3) is commented out, yet the later
# stages still try to read ens_pdb_rawpmatch — confirm whether the PDB
# input is produced elsewhere or this run should be re-enabled.
sub runpmatch {
print STDERR "Running pmatch\n";
#Run pmatch and store the data in files which will be kept for debugging
my $pmatch1 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $sp $ens > ens_sp_rawpmatch";
my $pmatch2 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $refseq $ens > ens_refseq_rawpmatch";
#my $pmatch3 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $pdb $ens > ens_pdb_rawpmatch";
# NOTE(review): the exit status of system() is not checked (the die
# clauses are deliberately commented out), so a pmatch failure is
# silently ignored and later stages will read empty or missing files.
system($pmatch1); # == 0 or die "$0\Error running '$pmatch1' : $!";
system($pmatch2); #== 0 or die "$0\Error running '$pmatch2' : $!";
#system($pmatch3); #== 0 or die "$0\Error running '$pmatch2' : $!";
}
# Collapse raw pmatch output into one record per (Ensembl peptide,
# external protein) pair, summing the percent identity over all matches
# of the pair.
#
# Reads  ens_<db>_rawpmatch and writes ens_<db>.processed, with one
# tab-separated line per pair passing the threshold:
#   <ensembl_id> <external_id> <cumulative_percent>
# Pairs whose cumulative percentage is below 25 are discarded.
#
# Arg: $db — one of the file-scoped globals $sp, $refseq or $pdb,
# identifying which database's raw output to process.
sub postprocesspmatch {
my ($db) = @_;
my %perc_of;     # "ens_id:ext_id" => summed percent identity
my %length_of;   # "ens_id:ext_id" => summed match length (debug only;
                 # see the commented-out length filter below)

# Select input/output files for the database being handled; fail fast on
# an unrecognised argument instead of silently reusing stale handles.
my ($raw_file, $processed_file, $label);
if ($db eq $sp) {
    ($raw_file, $processed_file, $label) = ('ens_sp_rawpmatch', 'ens_sp.processed', 'SP');
}
elsif ($db eq $refseq) {
    ($raw_file, $processed_file, $label) = ('ens_refseq_rawpmatch', 'ens_refseq.processed', 'REFSEQ');
}
elsif ($db eq $pdb) {
    ($raw_file, $processed_file, $label) = ('ens_pdb_rawpmatch', 'ens_pdb.processed', 'PDB');
}
else {
    die "postprocesspmatch: unknown database '$db'\n";
}
print STDERR "Postprocessing pmatch for $label mapping\n";
open my $out, '>', $processed_file or die "Can't open file $processed_file\n";
open my $proc, '<', $raw_file or die "Can't open file $raw_file\n";

while (<$proc>) {
    # Raw pmatch line, e.g.:
    # 538 COBP00000033978 1 538 35.3 Q14146 1 538 35.3
    my ($len, $id, $start, $end, $tperc, $query, $qst, $qend, $perc) = split;
    if ($db eq $refseq) {
        # Keep only the RefSeq accession (NP_\d+) out of the
        # pipe-delimited identifier.
        ($query) = $query =~ /\w+\|\d+\|\w+\|(\w+)/;
    }
    my $uniq = "$id:$query";
    # Accumulate the percentage of similarity per pair.  Known bug: the
    # sum can exceed 100%; fixing this would require changing the pmatch
    # source code itself.
    $perc_of{$uniq}   += $perc;
    $length_of{$uniq} += $len;
}

# Write out the processed data for pairs passing the 25% threshold.
foreach my $key (keys %perc_of) {
    #if (($length_of{$key} >= 20)) {
    if ($perc_of{$key} >= 25) {
        my ($ens_id, $ext_id) = split /:/, $key;
        print $out "$ens_id\t$ext_id\t$perc_of{$key}\n";
    }
}
close $proc;
close $out or die "Can't close $processed_file\n";
}
# Read ens_<db>.processed and label each Ensembl peptide matching a
# given external protein, writing ens_<db>.final with lines:
#   <known> <ensembl_peptide> <percent> <label>
# where the label is:
#   PRIMARY   - best (highest cumulative %) match; used for the mapping
#   DUPLICATE - within 1% of the PRIMARY; also usable for the mapping
#   REPEAT    - 20+ peptides hit this protein and this one is not close
#               to the PRIMARY
#   PSEUDO    - fewer than 20 peptides and not close to the PRIMARY
#
# Arg: $db — one of the file-scoped globals $sp, $refseq or $pdb.
sub finalprocess {
my ($db) = @_;

my ($processed_file, $final_file, $label);
if ($db eq $sp) {
    ($processed_file, $final_file, $label) = ('ens_sp.processed', 'ens_sp.final', 'SP');
}
elsif ($db eq $refseq) {
    ($processed_file, $final_file, $label) = ('ens_refseq.processed', 'ens_refseq.final', 'REFSEQ');
}
elsif ($db eq $pdb) {
    ($processed_file, $final_file, $label) = ('ens_pdb.processed', 'ens_pdb.final', 'PDB');
}
else {
    die "finalprocess: unknown database '$db'\n";
}
print STDERR "Getting final mapping for $label mapping\n";
open my $proc, '<', $processed_file or die "Can't open file $processed_file\n";
open my $out, '>', $final_file or die "Can't open file $final_file\n";

# Each external protein maps to a list of NamePerc objects pairing the
# matching Ensembl peptide name with its percentage of similarity.
my %matches_of;
while (<$proc>) {
    my ($ens_pep, $known, $perc) = split;
    my $p = NamePerc->new;
    $p->name($ens_pep);
    $p->perc($perc);
    push @{ $matches_of{$known} }, $p;
}

foreach my $known (keys %matches_of) {
    # Sort matches best-first; the top one becomes the PRIMARY mapping.
    my @ranked = sort { $b->perc <=> $a->perc } @{ $matches_of{$known} };
    my $top = shift @ranked;
    print $out "$known\t", $top->name, "\t", $top->perc, "\tPRIMARY\n";

    # Sanity check: after sorting, nothing left can outrank the PRIMARY.
    foreach my $m (@ranked) {
        if ($m->perc > $top->perc) {
            die "Not good....";
        }
    }

    if (scalar(@ranked) >= 20) {
        # 20+ Ensembl peptides match this protein: close-to-top matches
        # are DUPLICATEs, the remainder are flagged as REPEATs.
        foreach my $m (@ranked) {
            my $tag = ($m->perc + 1 >= $top->perc) ? 'DUPLICATE' : 'REPEAT';
            print $out "$known\t", $m->name, "\t", $m->perc, "\t$tag\n";
        }
    }
    else {
        # Fewer than 20: close-to-top matches are DUPLICATEs (also
        # usable for the mapping), the remainder are PSEUDO.
        foreach my $m (@ranked) {
            my $tag = ($m->perc + 1 >= $top->perc) ? 'DUPLICATE' : 'PSEUDO';
            print $out "$known\t", $m->name, "\t", $m->perc, "\t$tag\n";
        }
    }
}
close $proc;
close $out or die "Can't close $final_file\n";
}
# Small helper class used by the script: pairs an Ensembl peptide name
# with its percentage of similarity to an external protein.
package NamePerc;

# Constructor: returns an empty, blessed NamePerc instance.
sub new {
    my ($class) = @_;
    return bless {}, $class;
}
=head2 name
Title : name
Usage : $obj->name($newval)
Function:
Returns : value of name
Args : newvalue (optional)
=cut