Fetch_gff.pl 4.65 KB
Newer Older
1 2 3 4
#!/usr/local/ensembl/bin/perl -w

=head1 LICENSE

5
  Copyright (c) 1999-2011 The European Bioinformatics Institute and
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <ensembl-dev@ebi.ac.uk>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

=head1 NAME

 Fetch_gff.pl

=head1 AUTHORS

 Gautier Koscielny (koscieln@ebi.ac.uk)

=head1 SYNOPSIS

33
  Fetch_gff.pl -dbhost host -dbuser ensro -dbname homo_sapiens_core_58_37c -output_file homo_sapiens_core_58_37c_variants.gff
34 35 36 37 38 39 40 41

=head1 DESCRIPTION

This script generates a GFF file containing all the transcript variants 
from an Ensembl core database.

here is an example commandline

42
  ./Fetch_gff.pl -dbhost host -dbuser user -dbname my_db -dbpass **** -output_file transcript_variants.gff
43 44 45

=head1 OPTIONS

46 47 48 49 50 51 52
    -dbhost         host name for database (gets put as host= in locator)
    -dbname         what name to connect to (dbname= in locator)
    -dbuser         what username to connect as (dbuser= in locator)
    -dbpass         what password to use (dbpass= in locator)
    -chr            which chromosome (optional)
		-output_file|-o where the GFF output is written (optional, STDOUT by default) 
    -help           displays this documentation with PERLDOC
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68

=cut

use strict;
use warnings;

use Bio::EnsEMBL::Feature;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw);

use POSIX qw(ceil);

use Getopt::Long;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);

{ # block to avoid namespace pollution
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
		my $host        = '';
		my $port        = '';
		my $dbname      = '';
		my $dbuser      = '';
		my $dbpass      = '';
		my $chr         = undef;
		my $output_file = undef;
		my $help;
		my @coord_system;

		&GetOptions(
								'dbhost:s'        => \$host,
								'dbport:n'        => \$port,
								'dbname:s'        => \$dbname,
								'dbuser:s'        => \$dbuser,
								'dbpass:s'        => \$dbpass,
								'chr:s'           => \$chr,
								'output_file|o=s' => \$output_file,
								'h|help'          => \$help,
								) or ($help = 1);

		if(!$host || !$dbuser || !$dbname){
				print STDERR "Can't get any information without database details\n";
				print STDERR "-dbhost '$host' -dbuser '$dbuser' -dbname '$dbname' ".
						" -dbpass '$dbpass'\n";
				$help = 1;
		}

		if ($help) {
				exec('perldoc', $0);
		}

		
		my $output_stream;

		if (defined($output_file)) {

				open ($output_stream, ">$output_file") || throw "Can't open '$output_file' file for writing\n";

		} else {

				$output_stream = \*STDOUT;
				print STDERR "Will write GFF stream to the standard output.\n"; 
		}

		my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new
				(-dbname => $dbname,
				 -host   => $host,
				 -user   => $dbuser,
				 -port   => $port,
				 -pass   => $dbpass);


		my $gene_adaptor = $db->get_GeneAdaptor();
		my @stable_gene_ids = undef;
		my $size = 0;

		if (defined($chr)) {

				my $slice_adaptor = $db->get_SliceAdaptor();
				my $slice = $slice_adaptor->fetch_by_region( 'chromosome', $chr );
				@stable_gene_ids = @{ $gene_adaptor->fetch_all_by_Slice($slice) };
				$size = scalar @stable_gene_ids;
				print STDERR "Number of stable genes on region $chr:\t" . $size . "\n";

		} else {

				@stable_gene_ids = @{$gene_adaptor->list_stable_ids()};
				$size = scalar @stable_gene_ids;
				print STDERR "Number of stable ids:\t" . $size . "\n";
		}

		for my $id (@stable_gene_ids) {
				
				my $gene = ($chr) ? $id : $gene_adaptor->fetch_by_stable_id($id);

				my $gene_id = $gene->display_id();
				my $biotype = $gene->biotype();
				my $chr = $gene->slice->seq_region_name();
				my $strand = $gene->strand();
				my $start = $gene->start();
				my $end = $gene->end();
				
				my @transcripts = @{$gene->get_all_Transcripts()};
				for my $transcript (@transcripts) {
						
						my $transcr_id = $transcript->display_id() ; ;
						
						#Get the exons + print info.
						my $exons = $transcript->get_all_Exons() ;
						
						foreach my $exon (@$exons) {
								my $exon_id = $exon->display_id() ;
								my $exon_start = $exon->start() ;
								my $exon_end = $exon->end() ;
								my $exon_std = $exon->strand() ;
								my $slice = $exon->slice->seq_region_name();
								$exon_std =~ s/-1/-/ ;
								$exon_std =~ s/1/+/ ;
								print $output_stream "$chr\tEnsembl\texon\t$exon_start\t$exon_end\t.\t$exon_std\t.\tgene_id \"$gene_id\"; transcript_id \"$transcr_id\"; exon_id \"$exon_id\"\n" ;
						}
				}
		}
		
		exit 0;
174 175

}