BlatIndexer.pm

=pod

=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer

=head1 DESCRIPTION

Creates 2bit file of the given GZipped file. The resulting index
is created under the parameter location I<base_path> in blat. The filename
is prefixed with the port number of the blat server this file should be
run on.

The module also performs filtering of non-reference sequence regions
and can filter the redundant Y chromosome piece for human (as 2bit does
not like repeated sequence region names).

Allowed parameters are:

=over 8

=item file - The file to index

=item program - The location of the faToTwoBit program

=item port_offset - Value to add onto the species_id from the website DB
                    to name the file correctly

=item base_path - The base of the dumps

=item index     - The type of file to index; supported values are empty, 
                  I<dna> or I<dna_rm>. If specified we will look for this
                  string in the filename surrounded by '.' e.g. .dna.

=back

The registry should also have a DBAdaptor for the website schema 
registered under the species B<multi> and the group B<web> for species_id to
Blat port number. 

=cut


package Bio::EnsEMBL::Pipeline::FASTA::BlatIndexer;

use strict;
use warnings;
use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/;

use File::Spec;
use File::stat;
use Bio::EnsEMBL::Utils::IO qw/work_with_file/;
use Bio::EnsEMBL::Utils::Exception qw/throw/;
use Bio::EnsEMBL::Registry;

sub param_defaults {
  my ($self) = @_;
  return {
    program => 'faToTwoBit',
    port_offset => 30000,
    'index' => 'dna', #or dna_rm/''
  };
}

sub run {
  my ($self) = @_;
  if($self->run_indexing()) {
    $self->SUPER::run();
  }
  return;
}

sub run_indexing {
  my ($self) = @_;
  my $index = $self->param('index');
  if($index) {
    my $file = $self->param('file');
    return (index($file, ".${index}.") > -1) ? 1 : 0;
  }
  return 1;
}

sub index_file {
  my ($self, $file) = @_;
  
  my $target_file = $self->target_file();
  my $cmd = sprintf(q{%s %s %s}, 
    $self->param('program'), $file, $target_file);
  
  $self->info('About to run "%s"', $cmd);
  system($cmd) and throw "Cannot run program '$cmd'";
  unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!";
  
  #Check the file size. If it's 16 bytes then reject as that is an empty file for 2bit
  my $filesize = stat($target_file)->size();
  if($filesize <= 16) {
    unlink $file;
    my $msg = sprintf(
      'The file %s produced a 2bit file %d byte(s). Lower than 17 bytes therefore empty 2 bit file',
      $file, $filesize
    );
    $self->throw($msg);
  }
  
  return;
}

sub decompress {
  my ($self) = @_;
  
  #If we have no non-reference seq regions then use normal decompress
  if(! $self->has_non_refs()) {
    return $self->SUPER::decompress();
  }
  
  #Filter for non-refs
  my $source = $self->param('file');
  my $target_dir = $self->target_dir();
  my ($vol, $dir, $file) = File::Spec->splitpath($source);
  $file =~ s/.gz$//;
  my $target = File::Spec->catdir($target_dir, $file);
  
  my $allowed_regions = $self->allowed_regions();
  
  $self->info('Decompressing %s to %s', $source, $target);
  
  open my $src_fh, '-|', "zcat $source" or throw "Cannot decompress $source to $target";
  work_with_file($target, 'w', sub {
    my ($trg_fh) = @_;
    my $transfer = 0;
    while(my $line = <$src_fh>) {
      #HEADER
      if(index($line, '>') == 0) {
        #regex is looking for NNN:NNN:NNN:1:80:1 i.e. the name
        my ($name) = $line =~ />.+\s(.+:.+:.+:\d+:\d+:\d+)/; 
        $transfer = ($allowed_regions->{$name}) ? 1 : 0;
        if($transfer) {
          $self->info('%s was an allowed Slice', $name);
        }
        else {
          $self->info('%s will be skipped; not a reference Slice', $name);
        }
      }
      print $trg_fh $line if $transfer;
    }
  });
  close($src_fh);

  return $target;
}

sub allowed_regions {
  my ($self) = @_;
  my $filter_human = 1;
  my @slices = grep { $_->is_reference() } @{$self->get_Slices('core', $filter_human)};
  my %hash = map { $_->name() => 1 } @slices;
  return \%hash;
}

#Filename like 30061.Homo_sapiens.GRCh37.2bit
sub target_filename {
  my ($self) = @_;
  my $port = $self->blat_port();
  my $name = $self->web_name();
  my $assembly = $self->assembly();
  return join(q{.}, $port, $name, $assembly, '2bit');
}

sub target_file {
  my ($self) = @_;
  my $target_dir = $self->target_dir();
  my $target_filename = $self->target_filename();
  return File::Spec->catfile($target_dir, $target_filename);
  return;
}

sub target_dir {
  my ($self) = @_;
  return $self->get_dir('blat');
}

sub blat_port {
  my ($self) = @_;
  my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'web');
  my $id = $dba->dbc()->sql_helper()->execute_single_result(
    -SQL => 'select species_id from species where name =?',
    -PARAMS => [$self->web_name()]
  );
  return $id + $self->param('port_offset');
}

sub has_non_refs {
  my ($self) = @_;
  my $sql = <<'SQL';
select count(*)
from attrib_type at
join seq_region_attrib sra using (attrib_type_id)
join seq_region sr using (seq_region_id)
join coord_system cs using (coord_system_id)
where cs.species_id =?
and at.code =?
SQL
  my $dba = $self->get_DBAdaptor();
  return $dba->dbc()->sql_helper()->execute_single_result(
    -SQL => $sql, -PARAMS => [$dba->species_id(), 'non_ref']);
}

1;