Skip to content
Snippets Groups Projects
variation_density.pl 7.42 KiB
Newer Older
#!/usr/local/ensembl/bin/perl -w
# Calculates the variation density from given core database

use Bio::EnsEMBL::Registry;
use Getopt::Long;
use Data::Dumper;
$Data::Dumper::Maxdepth = 2;

Arne Stabenau's avatar
Arne Stabenau committed

my ($host, $user, $pass, $port, $species);
# Master database location:
my ( $mhost, $mport ) = ( 'ens-staging1', '3306' );
my ( $muser, $mpass ) = ( 'ensro',        undef );
my $mdbname = 'ensembl_production';
my ($block_count, $genome_size, $block_size );
GetOptions( "host|h=s",     \$host,
	    "user|u=s",     \$user,
	    "pass|p=s",     \$pass,
	    "port=i",       \$port,
	    "mhost=s", \$mhost,
	    "mport=i", \$mport,
	    "muser=s", \$muser,
	    "species|s=s",  \$species );
usage() if (!$host || !$user || !$pass || !$species );
my $reg = 'Bio::EnsEMBL::Registry';
$reg->load_registry_from_db(-host => $host, -user => $user, -pass => $pass, -port => $port, -species => $species);

my $density_feature_adaptor   = $reg->get_adaptor($species, "core", "DensityFeature")        || die "Can't create density feature adaptor";
my $density_type_adaptor      = $reg->get_adaptor($species, "core", "DensityType")           || die "Can't create density type adaptor";
my $analysis_adaptor          = $reg->get_adaptor($species, "core", "analysis")              || die "Can't create analysis adaptor";
my $slice_adaptor             = $reg->get_adaptor($species, "core", "slice")                 || die "Can't create slice adaptor";

my $variation_feature_adaptor = $reg->get_adaptor($species, "variation", "VariationFeature") || die "Can't create variation feature adaptor";

# release 63: do not delete analysis, as this is synchronized with production!
my $sth = $slice_adaptor->dbc->prepare("DELETE df, dt FROM analysis_description ad, density_feature df, density_type dt, analysis a WHERE ad.analysis_id = a.analysis_id AND a.analysis_id=dt.analysis_id AND dt.density_type_id=df.density_type_id AND a.logic_name='snpdensity'");
Glenn Proctor's avatar
Glenn Proctor committed
$sth->execute();
# Sort slices by coordinate system rank, then by length
my @sorted_slices = sort( {
               $a->coord_system()->rank() <=> $b->coord_system()->rank()
                 || $b->seq_region_length() <=> $a->seq_region_length()
} @{ $slice_adaptor->fetch_all('toplevel')} );
# Cannot use this due to PAR regions & basefeatureadaptor's fetch method 
#understanding par region translation
#} @{ $slice_adaptor->fetch_all('toplevel', '', undef, 1) } );

my $analysis = $analysis_adaptor->fetch_by_logic_name('snpdensity');

if ( !defined($analysis) ) {

   my $prod_dsn = sprintf( 'DBI:mysql:host=%s;port=%d;database=%s',
                     $mhost, $mport, $mdbname );
   my $prod_dbh = DBI->connect( $prod_dsn, $muser, $mpass,
                          { 'PrintError' => 1, 'RaiseError' => 1 } );

   my ($display_label,$description) = $prod_dbh->selectrow_array("select distinct display_label, description from analysis_description where is_current = 1 and logic_name = 'snpdensity'");

   $prod_dbh->disconnect;

   $analysis = new Bio::EnsEMBL::Analysis(
              -program     => "variation_density.pl",
              -database    => "ensembl",
              -gff_source  => "variation_density.pl",
              -gff_feature => "density",
              -logic_name  => "snpdensity",
              -description => $description,
              -display_label => $display_label,
              -displayable   => 1 );

    $analysis_adaptor->store($analysis);
} else {

    my $support = 'Bio::EnsEMBL::Utils::ConversionSupport';
    $analysis->created($support->date());
    $analysis_adaptor->update($analysis);

}
my $dt = Bio::EnsEMBL::DensityType->new(-analysis        => $analysis,
Arne Stabenau's avatar
Arne Stabenau committed
					-region_features => $bin_count,
					-value_type      => 'sum');
$density_type_adaptor->store($dt);

# Now the actual feature calculation loop

Arne Stabenau's avatar
Arne Stabenau committed
my $slice_count = 0;

my ($current, $current_start, $current_end);
# prepare statement outside of loop
$sth = $variation_feature_adaptor->prepare("SELECT COUNT(*) FROM variation_feature WHERE seq_region_id = ? AND seq_region_start < ? AND seq_region_end > ?");
my $total_count = 0;

while ( my $slice = shift @sorted_slices){
Arne Stabenau's avatar
Arne Stabenau committed

  $block_size = $slice->length() / $bin_count;
  my @density_features;

  print STDOUT "Calculating SNP densities for ". $slice->seq_region_name() . " with block size $block_size\n";
Arne Stabenau's avatar
Arne Stabenau committed
  $current_end = 0;
  $current = 0;

Arne Stabenau's avatar
Arne Stabenau committed

    $current += $block_size;
    $current_start = $current_end+1;
    $current_end = int( $current + 1 );

Arne Stabenau's avatar
Arne Stabenau committed
      $current_end = $current_start;
    }

      $current_end = $slice->end();
    }

    # Get count of SNPs in this region
    $sth->execute($slice->get_seq_region_id(), $current_end, $current_start);
    my $count = ($sth->fetchrow_array())[0];
    push @density_features, Bio::EnsEMBL::DensityFeature->new(-seq_region    => $slice,
					       -start         => $current_start,
        				       -end           => $current_end,
        				       -density_type  => $dt,
        				       -density_value => $count);
    if ($count > 0) {
       #density features with value = 0 are not stored
       $total_count++;
Arne Stabenau's avatar
Arne Stabenau committed

  $density_feature_adaptor->store(@density_features);

Arne Stabenau's avatar
Arne Stabenau committed

print STDOUT "Written $total_count density features for species $species on server $host\n";


sub usage {
  my $indent = ' ' x length($0);
  print <<EOF; exit(0);

What does it do?

Calculates the density of SNP features on top level sequences.
Deletes out all density_feature and density_type entries having
analysis logic_name 'snpDensity'. Deletes analysis_description
where display_label = 'snpDensity'. All toplevel slices are fetched
and sorted from longest to shortest. Each slice is divided into 150
bins. For each sub_slice, we count and store the number of
variation_features (SNPs) on that sub_slice.


Deletes out all seq_region_attrib that have attrib_type code of 'SNPCount'. 
Attach variation db if exists. All toplevel slices are fetched. 
For each slice, count the number of SNPs.

Input data: top level seq regions, variation db
Output tables: updates analysis creation date, density_feature, density_type

The script requires ensembl-variation in perl5lib.

When to run it in the release cycle?

When variation dbs have been handed over


Which databases to run it on?

The script updates a core database using data from the corresponding variation database. 
Run it for new species or where the core assembly has changed, or if there are any changes to variation positions in the variation database.


How long does it take?

It takes about 25 mins to run for a database in normal queue.



Usage: 

  $0 -h host [-port port] -u user -p password \\
  $indent -s species \\
  $indent [-mhost ensembl_production host] [-mport ensembl_production host] [-muser ensembl_production host] \\
  $indent [-help]  \\

  -h|host             Database host to connect to

  -port               Database port to connect to (default 3306)

  -u|user             Database username

  -p|pass             Password for user

  -s|species          Species name

  -mhost              ensembl_production database host to connect to

  -mport              ensembl_production database port to connect to

  -muser              ensembl_production database username