Skip to content
Snippets Groups Projects
Commit 04ef7f00 authored by Nathan Johnson's avatar Nathan Johnson
Browse files

initial import of external_feature parsers

parent e18b3dbb
No related branches found
No related tags found
No related merge requests found
...@@ -4,18 +4,187 @@ use strict; ...@@ -4,18 +4,187 @@ use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::SimpleFeature; use Bio::EnsEMBL::SimpleFeature;
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Utils::Exception qw( throw );
use Bio::EnsEMBL::Funcgen::FeatureSet;
use Bio::EnsEMBL::Funcgen::FeatureType;
use Bio::EnsEMBL::Analysis;
# Base functionality for external_feature parsers
sub new {
my $caller = shift;
my $class = ref($caller) || $caller;
my $self = {};
#my $self = {(
# feature_sets => {(
# vista => {('VISTA enhancer set' => undef)},
# cisred => {(
# 'cisRED search regions' => undef,
# 'cisRED group motifs' => undef,
# )},
# miranda => {('miRanda miRNA' => undef)},
# )},
# )};
bless $self, $class;
#validate and set type, analysis and feature_set here
my ($type, $db, $clobber, $archive) = rearrange(['TYPE', 'DB', 'CLOBBER', 'ARCHIVE'], @_);
throw('You must define a type of external_feature to import') if(! defined $type);
if (! ($db && ref($db) &&
$db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){
throw('You must provide a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor');
}
throw('You can only specify either -clobber or -archive, but not both') if($clobber && $archive);
$self->{'db'} = $db;
$self->{type} = $type;
$self->{'clobber'} = $clobber if defined $clobber;
$self->{'archive'} = $archive if defined $archive;
print ":: Parsing and loading $type ExternalFeatures";
return $self;
}
sub db{
my ($self, $db) = @_;
if($db){
if(! $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor')){
throw('You must prove a Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor');
}
$self->{'db'} = $db;
}
return $self->{'db'};
}
sub set_feature_sets{
my $self = shift;
throw('Must provide a set feature_set config hash') if ! defined $self->{'feature_sets'};
my $fset_adaptor = $self->db->get_FeatureSetAdaptor;
my $analysis_adaptor = $self->db->get_AnalysisAdaptor;
foreach my $fset_name(keys %{$self->{feature_sets}}){
#check feature_type is validated?
my $fset = $fset_adaptor->fetch_by_name($fset_name);
#what about data sets?
#we don't need data sets for external_sets!
if(defined $fset){
if($self->{'clobber'}){
#Need to clobber any DBEntries first!!!
if($self->{'type'} eq 'cisred'){
my @ext_feat_ids = @{$self->db->dbc->db_handle->selectall_arrayref('select external_feature_id from external_feature where feature_set_id='.$fset->dbID)};
if(@ext_feat_ids){
my ($core_ext_dbid) = @{$self->db->dbc->db_handle->selectall_arrayref('select external_db_id from external_db where db_name="core"')};
if($core_ext_dbid){
#double table delete?
my $sql = "delete x, ox from object_xref ox, xref x where ox.ensembl_object_type='ExternalFeature' and x.external_db_id=$core_ext_dbid and ox.xref_id=x.xref_id and ensembl_id in(".join(', ', @ext_feat_ids).')';
print ":: Clobbering xrefs for $fset_name\n";
$self->db->dbc->do($sql);
}
}
print ":: Clobbering old features for external feature_set:\t$fset_name\n";
my $sql = 'delete from external_feature where feature_set_id='.$fset->dbID;
$self->db->dbc->do($sql);
}
}
elsif($self->{'archive'}){
my $archive_fset = $fset_adaptor->fetch_by_name($fset_name."_v".$self->{'archive'});
if(defined $archive_fset){
throw("You are trying to create an archive external feature_set which already exists:\t${fset_name}_v".$self->{archive});
}
my $sql = "UPDATE feature_set set name='$fset_name}_v".$self->{archive}."' where name='$fset_name'";
$self->db->dbc->do($sql);
undef $fset;
}else{
throw("You are trying to create an external feature_set which already exists:\t$fset_name\nMaybe to want to clobber or archive?");
}
}
if(!defined $fset){
#don't need to use RNAFeatureType here as this is the setwide generic feature_type
#or do we have separate tables for external_feature and external_rna_feature?
#validate analysis first
my $analysis = $analysis_adaptor->fetch_by_logic_name($self->{'feature_sets'}{$fset_name}{'analysis'}{'-logic_name'});
if(! defined $analysis){
print ':: Analysis '.$self->{'feature_sets'}{$fset_name}{'analysis'}{'-logic_name'}.
" not found, storing from config hash\n";
$analysis_adaptor->store(Bio::EnsEMBL::Analysis->new
(
%{$self->{'feature_sets'}{$fset_name}{'analysis'}}
#-logic_name => $self->{'feature_sets'}{$fset_name}{'analysis'}{'logic_name'},
#-description => $self->{'feature_sets'}{$fset_name}{'analysis'}{'description'},
#-display_label => $self->{'feature_sets'}{$fset_name}{'analysis'}{'display_label'},
#-diplayable => $self->{'feature_sets'}{$fset_name}{'analysis'}{'displayable'},
));
$analysis = $analysis_adaptor->fetch_by_logic_name($self->{'feature_sets'}{$fset_name}{'analysis'});
}
#replace hash config with object
$self->{'feature_sets'}{$fset_name}{'analysis'} = $analysis;
$fset = Bio::EnsEMBL::Funcgen::FeatureSet->new(
-name => $fset_name,
-type => 'external',
-analysis => $self->{'feature_sets'}{$fset_name}{'analysis'},
-feature_type => ${$self->{'feature_sets'}{$fset_name}{'feature_type'}},
);
($fset) = @{$self->db->get_FeatureSetAdaptor->store($fset)};
}
#Now replace config hash with object
$self->{feature_sets}{$fset_name} = $fset;
}
return;
}
# Base functionality for regualatory feature parsers
# -------------------------------------------------------------------------------- # --------------------------------------------------------------------------------
# Delete existing regulatory features etc # Delete existing regulatory features etc
#This needn't be done in line any more as we can just make the old set not displayable
sub delete_existing { sub delete_existing {
my ($db_adaptor, $type) = @_; my ($db_adaptor, $type) = @_;
throw('delete existing is deprecated');
my $t = lc($type); my $t = lc($type);
print "Deleting existing features & related data for type $type\n"; print "Deleting existing features & related data for type $type\n";
...@@ -52,27 +221,61 @@ sub delete_existing { ...@@ -52,27 +221,61 @@ sub delete_existing {
# -------------------------------------------------------------------------------- # --------------------------------------------------------------------------------
# Check that the type specified corresponds to an analysis # Check that the type specified corresponds to an analysis
sub validate_type { sub validate_and_store_feature_types{
my $self = shift;
my ($db_adaptor, $type) = @_;
# LBL enhancers have both positive and negative varieties # LBL enhancers have both positive and negative varieties
if (lc($type) eq 'enhancer') {
return (validate_type($db_adaptor, 'enhancer_positive') && validate_type($db_adaptor, 'enhancer_negative'));
}
my $sth = $db_adaptor->dbc->prepare("SELECT analysis_id FROM analysis WHERE LOWER(logic_name)=?"); #feature type class/naming is logical but not intuitive
#+-----------------+-------------------------+----------+----------------------------------------------------+
#| feature_type_id | name | class | description |
#+-----------------+-------------------------+----------+----------------------------------------------------+
#| 398680 | VISTA Enhancer | Enhancer | Enhancer identified by positive VISTA assay |
#| 398681 | VISTA Target - Negative | Region | Enhancer negative region identified by VISTA assay |
#+-----------------+-------------------------+----------+----------------------------------------------------+
$sth->execute(lc($type));
if ($sth->fetchrow_array()) {
print "Type $type is valid\n";
} else {
print "Type $type is not valid - is there an entry for $type in the analysis table?\n";
return 0;
}
return 1; #if (lc($type) eq 'VISTA') {
# return (validate_type($db_adaptor, 'VISTA Enhancer') && validate_type($db_adaptor, 'VISTA Target - Negative'));
#}
#my $sth = $self->db->dbc->prepare("SELECT analysis_id FROM analysis WHERE logic_name=?");
#remove lc as mysql doesn't care about case
#$sth->execute($type);
#if ($sth->fetchrow_array()) {
# print "Type $type is valid\n";
#} else {
# print "Type $type is not valid - is there an entry for $type in the analysis table?\n";
# return 0;
#}
my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor;
foreach my $ftype_name(keys %{$self->{'feature_types'}}){
my $ftype = $ftype_adaptor->fetch_by_name($ftype_name);
if(! defined $ftype){
print ":: FeatureType '".$ftype_name."' for external feature_set ".$self->{'type'}." not present\n".
":: Storing using type hash definitions\n";
$ftype = Bio::EnsEMBL::Funcgen::FeatureType->new(
-name => $ftype_name,
-class => $self->{'feature_types'}{$ftype_name}{'class'},
-description => $self->{'feature_types'}{$ftype_name}{'description'},
);
($ftype) = @{$ftype_adaptor->store($ftype)};
}
#Replace hash config with object
$self->{'feature_types'}{$ftype_name} = $ftype;
}
return;
} }
...@@ -80,10 +283,11 @@ sub validate_type { ...@@ -80,10 +283,11 @@ sub validate_type {
# Find the maximum existing ID in a table. # Find the maximum existing ID in a table.
sub find_max_id { sub find_max_id {
my ($self, $db_adaptor, $table) = @_; my ($self, $table) = @_;
my $row = @{$db_adaptor->dbc->db_handle->selectall_arrayref("SELECT MAX(${table}_id) FROM $table")}[0]; my $row = @{$self->dbc->db_handle->selectall_arrayref("SELECT MAX(${table}_id) FROM $table")}[0];
my $max_id = @{$row}[0]; my $max_id = @{$row}[0];
if (!defined $max_id) { if (!defined $max_id) {
print "No existing ${table}_ids, will start from 1\n"; print "No existing ${table}_ids, will start from 1\n";
$max_id = 0; $max_id = 0;
...@@ -100,36 +304,46 @@ sub find_max_id { ...@@ -100,36 +304,46 @@ sub find_max_id {
# Return hashref keyed on {$type}{$stable_id} # Return hashref keyed on {$type}{$stable_id}
# Type is always all lower case # Type is always all lower case
sub build_stable_id_cache { sub build_display_name_cache {
my ($self, @types) = @_;
my ($self, $db_adaptor) = @_; my %display_name_cache;
my %stable_id_to_internal_id; #validate types here?
foreach my $type ('gene', 'transcript', 'translation') { # Add exon here if required foreach my $type (@types){#'gene', 'transcript', 'translation') { # Add exon here if required
print "Caching stable ID -> internal ID links for ${type}s\n"; #No display_xref for translations!!
throw('Cannot build cache for tranlsations, just use stable_id, check your implementatio') if $type eq 'translation';
print ":: Caching stable ID -> display_name links for ${type}s\n";
my $count = 0; my $count = 0;
my $sql;
my $sth = $db_adaptor->dbc->prepare("SELECT ${type}_id, stable_id FROM ${type}_stable_id"); print "dnadb is ".Data::Dumper::Dumper($self->db->dnadb->dbc);
$sth->execute();
my ($internal_id, $stable_id); #could do a fetchall_hashref here?
$sth->bind_columns(\$internal_id, \$stable_id); my $sth= $self->db->dnadb->dbc->prepare("SELECT t.stable_id, x.display_label FROM s.${type}_stable_id, t.${type} left join x.xref on t.display_xref_id=x.xref_id and s.${type}_id=t.${type}_id");
while ($sth->fetch) {
$stable_id_to_internal_id{$type}{$stable_id} = $internal_id; $sth->execute();
$count++; my ($stable_id, $display_name);
$sth->bind_columns(\$stable_id, \$display_name);
while ($sth->fetch) {
#removed type
$display_name_cache{$stable_id} = $display_name;
$count++ if $display_name;#will this be ! NULL?
} }
print "Got $count $type stable ID -> internal ID mappings\n"; print ":: Got $count $type stable ID -> display_name mappings\n";
} }
return \%stable_id_to_internal_id; return %display_name_cache;
} }
# -------------------------------------------------------------------------------- # --------------------------------------------------------------------------------
...@@ -137,9 +351,18 @@ sub build_stable_id_cache { ...@@ -137,9 +351,18 @@ sub build_stable_id_cache {
sub upload_features_and_factors { sub upload_features_and_factors {
my ($self, $db_adaptor, $objects) = @_; my ($self, $objects) = @_;
throw("upload_features_and_factors is deprecated");
my $dbc = $self->db->dbc;
my $dbc = $db_adaptor->dbc;
# we need to convert this to use the external_feature API
#this means we need to build external_features in the other modules and test for each RNA_feature_type/motif_feature_type?
#keep as feature_types for now, maybe next release?
my $feature_sth = $dbc->prepare("INSERT INTO regulatory_feature (regulatory_feature_id, name, seq_region_id, seq_region_start, seq_region_end, seq_region_strand, analysis_id, regulatory_factor_id) VALUES(?,?,?,?,?,?,?,?)"); my $feature_sth = $dbc->prepare("INSERT INTO regulatory_feature (regulatory_feature_id, name, seq_region_id, seq_region_start, seq_region_end, seq_region_strand, analysis_id, regulatory_factor_id) VALUES(?,?,?,?,?,?,?,?)");
my $factor_sth = $dbc->prepare("INSERT INTO regulatory_factor (regulatory_factor_id, name, type) VALUES(?,?,?)"); my $factor_sth = $dbc->prepare("INSERT INTO regulatory_factor (regulatory_factor_id, name, type) VALUES(?,?,?)");
...@@ -211,65 +434,88 @@ sub upload_features_and_factors { ...@@ -211,65 +434,88 @@ sub upload_features_and_factors {
# -------------------------------------------------------------------------------- # --------------------------------------------------------------------------------
# Project a feature from one slice to another # Project a feature from one slice to another
sub project_feature { sub project_feature {
my ($self, $feat, $new_assembly) = @_;
my ($self, $start, $end, $strand, $chr, $slice, $analysis, $new_assembly, $slice_adaptor, $label) = @_;
# just use a SimpleFeature for convenience # just use a SimpleFeature for convenience
my $feat = Bio::EnsEMBL::SimpleFeature->new #my $feat = Bio::EnsEMBL::SimpleFeature->new
(-start => $start, # (-start => $start,
-end => $end, # -end => $end,
-strand => $strand, # -strand => $strand,
-slice => $slice, # -slice => $slice,
-analysis => $analysis, # -analysis => $analysis,
-display_label => $label, # -display_label => $label,
-score => 0); # -score => 0);
# project feature to new assembly # project feature to new assembly
my $feat_slice = $feat->feature_Slice; my $feat_slice = $feat->feature_Slice;
my @segments = @{ $feat_slice->project('chromosome', $new_assembly) }; my @segments = @{ $feat_slice->project('chromosome', $new_assembly) };
next unless (@segments);
next if (scalar(@segments) > 1); #next what?
#next unless (@segments);
#next if (scalar(@segments) > 1);
if(! @segments){
print "Failed to project feature:\t".$feat->display_label."\n";
}
elsif(scalar(@segments) >1){
print "Failed to project feature to distinct location:\t".$feat->display_label."\n";
return;
}
my $proj_slice = $segments[0]->to_Slice; my $proj_slice = $segments[0]->to_Slice;
next unless ($feat_slice->length == $proj_slice->length);
if($feat_slice->length != $proj_slice->length){
print "Failed to project feature to comparable length region:\t".$feat->display_label."\n";
return;
}
# everything looks fine, so adjust the coords of the feature # everything looks fine, so adjust the coords of the feature
$feat->start($proj_slice->start); $feat->start($proj_slice->start);
$feat->end($proj_slice->end); $feat->end($proj_slice->end);
$feat->strand($proj_slice->strand); $feat->strand($proj_slice->strand);
my $slice_new_asm = $slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, undef, $new_assembly); my $slice_new_asm = $self->slice_adaptor->fetch_by_region('chromosome', $proj_slice->seq_region_name, undef, undef, undef, $new_assembly);
$feat->slice($slice_new_asm); $feat->slice($slice_new_asm);
return $feat; return $feat;
} }
sub slice_adaptor{
my $self = shift;
if(! defined $self->{'slice_adaptor'}){
$self->{'slice_adaptor'} = $self->db->get_SliceAdaptor;
}
return $self->{'slice_adaptor'};
}
# -------------------------------------------------------------------------------- # --------------------------------------------------------------------------------
sub get_blank_factor_id () { #sub get_blank_factor_id () {
my ($self, $db_adaptor) = @_; # my ($self, $db_adaptor) = @_;
my $sth = $db_adaptor->dbc->prepare("SELECT regulatory_factor_id FROM regulatory_factor WHERE name=''"); # my $sth = $db_adaptor->dbc->prepare("SELECT regulatory_factor_id FROM regulatory_factor WHERE name=''");
$sth->execute(); # $sth->execute();
my ($factor_id) = $sth->fetchrow_array(); # my ($factor_id) = $sth->fetchrow_array();
if ($factor_id) { # if ($factor_id) {
print "Found existing blank factor, id = $factor_id\n"; # print "Found existing blank factor, id = $factor_id\n";
} else { # } else {
$db_adaptor->dbc->do("INSERT INTO regulatory_factor (name) VALUES ('')"); # $db_adaptor->dbc->do("INSERT INTO regulatory_factor (name) VALUES ('')");
$sth->execute(); # $sth->execute();
($factor_id) = $sth->fetchrow_array(); # ($factor_id) = $sth->fetchrow_array();
print "Created new blank factor, id = $factor_id\n"; # print "Created new blank factor, id = $factor_id\n";
} # }
return $factor_id; # return $factor_id;
} #}
1; 1;
...@@ -10,13 +10,21 @@ use File::Basename; ...@@ -10,13 +10,21 @@ use File::Basename;
# #
# http://www.cisred.org/content/databases_methods/human_2/data_files/search_regions.txt # http://www.cisred.org/content/databases_methods/human_2/data_files/search_regions.txt
#No longer valid urls, now use the following for ensembl formats for all species:
#http://www.bcgsc.ca/downloads/cisred/temp/cisRED4Ensembl/
#naminf may not be obvious, may have to cross reference with above previous urls to get build info
# Format of motifs.txt (note group_name often blank) # Format of motifs.txt (note group_name often blank)
# name chromosome start end strand group_name #name chromosome start end strand group_name ensembl_gene
# craHsap1 X 138337029 138337034 1 #craHsap1 1 168129978 168129997 -1 1 ENSG00000000457
# craHsap2 X 138338145 138338150 1 #craHsap2 1 168129772 168129781 -1 2 ENSG00000000457
# craHsap3 X 138338363 138338368 1 #craHsap3 1 168129745 168129756 -1 3 ENSG00000000457
# craHsap4 X 138338388 138338395 1 #craHsap4 1 168129746 168129753 -1 4 ENSG00000000457
#craHsap5 1 168129745 168129752 -1 5 ENSG00000000457
#craHsap6 1 168129741 168129757 -1 6 ENSG00000000457
# Format of search_regions.txt # Format of search_regions.txt
# name chromosome start end strand ensembl_gene_id # name chromosome start end strand ensembl_gene_id
...@@ -26,284 +34,301 @@ use File::Basename; ...@@ -26,284 +34,301 @@ use File::Basename;
# 19 1 23602820 23605631 -1 ENSG00000007968 # 19 1 23602820 23605631 -1 ENSG00000007968
use Bio::EnsEMBL::DBSQL::DBAdaptor; use RegulatoryFeatureParser::BaseParser;
use Bio::EnsEMBL::DBEntry;
use Bio::EnsEMBL::Funcgen::ExternalFeature;
use vars qw(@ISA); use vars qw(@ISA);
@ISA = qw(RegulatoryFeatureParser::BaseParser); @ISA = qw(RegulatoryFeatureParser::BaseParser);
# Parse file and return hashref containing:
#
# - arrayref of features
# - arrayref of factors
sub parse {
my ($self, $db_adaptor, $file, $old_assembly, $new_assembly) = @_;
my %result;
my $analysis_adaptor = $db_adaptor->get_AnalysisAdaptor(); sub new {
my $slice_adaptor = $db_adaptor->get_SliceAdaptor(); my $caller = shift;
my $class = ref($caller) || $caller;
my $stable_id_to_internal_id = $self->build_stable_id_cache($db_adaptor);
my $self = $class->SUPER::new(@_);
print "Parsing $file with cisred parser\n";
#Set default feature_type and feature_set config
# ---------------------------------------- $self->{'feature_types'} = {
# We need a "blank" factor for those features which aren't assigned factors 'cisRED Search Region' => {
# Done this way to maintain referential integrity class => 'Region',
description => 'cisRED search region',
my $blank_factor_id = $self->get_blank_factor_id($db_adaptor); },
'cisRED Motif' => {
# ---------------------------------------- class => 'Regulatory Motif',
description => 'cisRED group motif',
my $feature_internal_id = ($self->find_max_id($db_adaptor, "regulatory_feature")) + 1; },
my $highest_factor_id = ($self->find_max_id($db_adaptor, "regulatory_factor")) + 1; };
my @features; $self->{feature_sets} = {
my @factors; 'cisRED search regions' => {
my %factor_ids_by_name; # name -> factor_id feature_type => \$self->{'feature_types'}{'cisRED Search Region'},
my %feature_objects; analysis =>
{
-logic_name => 'cisRED',
-description => 'cisRED motif search (www.cisred.org)',
-display_label => 'cisRED',
-displayable => 1,
},
},
'cisRED group motifs' => {
feature_type => \$self->{'feature_types'}{'cisRED Motif'},
analysis =>
{
-logic_name => 'cisRED',
-description => 'cisRED motif search (www.cisred.org)',
-display_label => 'cisRED',
-displayable => 1,
},
},
};
$self->validate_and_store_feature_types;
$self->set_feature_sets;
# ---------------------------------------- return $self;
# Analysis - need one for each type of feature }
my %analysis;
foreach my $anal ("cisRed", "cisred_search") { # TODO - add other types as necessary
my $analysis_obj = $analysis_adaptor->fetch_by_logic_name($anal);
die "Can't get analysis for $anal, skipping" if (!$analysis_obj);
$analysis{$anal} = $analysis_obj->dbID();
print "Analysis ID for $anal is " . $analysis{$anal} . "\n";
} # Parse file and return hashref containing:
#
# - arrayref of features
# - arrayref of factors
sub parse_and_load {
my ($self, $file, $old_assembly, $new_assembly) = @_;
print ":: Parsing $file with cisRED parser\n";
my $analysis_adaptor = $self->db->get_AnalysisAdaptor();
my %features_by_group; # name -> factor_id
my %slice_cache;
my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor;
my $dbentry_adaptor = $self->db->get_DBEntryAdaptor;
my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor;
my $display_name_cache = $self->build_display_name_cache('gene');
# this object is only used for projection # this object is only used for projection
my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'CisRedProjection'); my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'CisRedProjection');
# ---------------------------------------- # ----------------------------------------
# Parse motifs.txt file # We need a "blank" factor for those features which aren't assigned factors
# Done this way to maintain referential integrity
print "Parsing features from $file\n"; #my $blank_factor_id = $self->get_blank_factor_id($db_adaptor);
# Parse motifs.txt file
print ":: Parsing cisRED motifs from $file\n";
my $skipped = 0; my $skipped = 0;
my $skipped_xref = 0;
my $coords_changed = 0; #my $coords_changed = 0;
my $cnt = 0;
open (FILE, "<$file") || die "Can't open $file"; open (FILE, "<$file") || die "Can't open $file";
<FILE>; # skip header <FILE>; # skip header
while (<FILE>) {
while (<FILE>) {
next if ($_ =~ /^\s*\#/ || $_ =~ /^\s*$/); next if ($_ =~ /^\s*\#/ || $_ =~ /^\s*$/);
my %feature;
# name chromosome start end strand group_name ensembl_gene_id # name chromosome start end strand group_name ensembl_gene_id
my ($motif_name, $chromosome, $start, $end, $strand, $group_name, $gene_id) = split (/\t/); my ($motif_name, $chromosome, $start, $end, $strand, $group_name, $gene_id) = split (/\t/);
($gene_id) = $gene_id =~ /(ENS.*G\d{11})/; #($gene_id) = $gene_id =~ /(ENS.*G\d{11})/;
# ---------------------------------------- if(! exists $slice_cache{$chromosome}){
# Feature name & analysis
if($old_assembly){
$feature{NAME} = $motif_name; $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome',
$feature{INFLUENCE} = "unknown"; # TODO - what does cisRed store? $chromosome,
$feature{ANALYSIS_ID} = $analysis{cisRed}; undef,
undef,
# ---------------------------------------- undef,
# Factor $old_assembly);
}else{
# If $group_id is present in %group_sizes we want to create or reuse a factor. $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome);
# If not, this feature is not associated with any factor. }
# If the factor is to be created, its name is crtHsapXX where XX is group_id.
# TODO - other species prefixes if(! defined $slice_cache{$chromosome}){
warn "Can't get slice $chromosome for motif $motif_name\n";
$feature{FACTOR_ID} = $blank_factor_id; $skipped++
if ($group_name && $group_name ne '' && $group_name !~ /\s/) { next;
my $factor_id = $factor_ids_by_name{$group_name}; }
}
if (!$factor_id) { # create one
my %factor; #get feature_type first
$factor_id = $highest_factor_id + 1;
$factor{INTERNAL_ID} = $factor_id; #we are not maintaining this link in the DB!
$factor{NAME} = $group_name; #Do we need another xref for this or a different table?
$factor{TYPE} = 'NULL';
push @factors, \%factor;
$factor_ids_by_name{$factor{NAME}} = $factor_id; if ($group_name && $group_name ne '' && $group_name !~ /\s/) {
$highest_factor_id = $factor_id;
#print join(" ", ("Factor: ", $factor{ID}, $factor{NAME}, $factor{TYPE})) . "\n"; if(! exists $features_by_group{$group_name}){
} $features_by_group{$group_name} = $ftype_adaptor->fetch_by_name('crtHsap'.$group_name);
$feature{FACTOR_ID} = $factor_id;
} if(! defined $features_by_group{$group_name}){
($features_by_group{$group_name}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new
# ---------------------------------------- (
# Seq_region ID and co-ordinates, projected if necessary -name => 'crtHsap'.$group_name,
-class => 'Regulatory Motif',
my $chr_slice = $slice_adaptor->fetch_by_region('chromosome', $chromosome, undef, undef, undef, $old_assembly); -description => 'cisRED group motif',
))};
if (!$chr_slice) { }
print STDERR "Can't get slice for $chromosome:$start:$end\n"; }
next; }else{
} throw('Found cisRED feature $motif_name with no group_name, unable to defined feature_type');
}
my $seq_region_id = $slice_adaptor->get_seq_region_id($chr_slice);
my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new(
if (!$seq_region_id) { -display_label => $motif_name,
print STDERR "Can't get seq_region_id for chromosome $chromosome\n"; -start => $start,
next; -end => $end,
} -strand => $strand,
-feature_type => $features_by_group{$group_name}
$feature{SEQ_REGION_ID} = $seq_region_id; -feature_set => $self->{'feature_sets'}{'cisRED group motifs'},
-slice => $slice_cache{$chromosome},
);
# project if necessary # project if necessary
if ($new_assembly) { if ($new_assembly) {
$feature = $self->project_feature($feature, $new_assembly);
#print join("\t", "OLD: ", $start, $end, $strand, $chromosome, $motif_name) . "\n"; if(! defined $feature){
$skipped ++;
my $projected_feature = $self->project_feature($start, $end, $strand, $chromosome, $chr_slice, $dummy_analysis, $new_assembly, $slice_adaptor, $motif_name); next;
}
$coords_changed++ if ($projected_feature->start() != $start || $projected_feature->end() != $end);
$start = $projected_feature->start();
$end = $projected_feature->end();
$strand = $projected_feature->strand();
$feature{SEQ_REGION_ID} = $slice_adaptor->get_seq_region_id($projected_feature->feature_Slice); #$coords_changed++ if ($projected_feature->start() != $start || $projected_feature->end() != $end);
#print join("\t", "NEW: ", $start, $end, $strand, $chromosome, $motif_name) . "\n";
}
$feature{START} = $start;
$feature{END} = $end;
$feature{STRAND} = $strand;
# ----------------------------------------
# Ensembl object - always a gene in cisRed
my $ensembl_id = $stable_id_to_internal_id->{gene}->{$gene_id};
if (!$ensembl_id) {
print STDERR "Can't get ensembl internal ID for $gene_id, skipping\n";
print join("-", $motif_name, $chromosome, $start, $end, $strand, $group_name, $gene_id, "\n");
$skipped++;
next;
} }
$feature{ENSEMBL_TYPE} = "Gene"; ($feature) = @{$extf_adaptor->store($feature)};
$feature{ENSEMBL_ID} = $ensembl_id; $cnt++;
# ---------------------------------------- #Build Xref here
# Feature internal ID if($gene_id){
$feature{INTERNAL_ID} = $feature_internal_id++; #need to take xref core dbname as a parameter
#defaulting to current db at present
# ----------------------------------------
# Evidence #should this not have some 'gene' e.g. core_gene?
$feature{EVIDENCE} = ""; if(! exists $display_name_cache->{$gene_id}){
warn "Cannot get ensembl gene id $gene_id for motif $motif_name\n";
# ---------------------------------------- $skipped_xref++;
# Add to object to be returned next;
}
push @features, \%feature;
my $dbentry = Bio::EnsEMBL::DBEntry->new(
-dbname => 'core',
-release => $self->db->dnadb->dbname,
-status => 'KNOWNXREF',
-display_label_linkable => 1,
-db_display_name => $self->db->dnadb->dbname,
-type => 'MISC',
-dbprimary_acc => $gene_id,
-display_name => $display_name_cache->{$gene_id},
-info_type => 'MISC'
#could have version here if we use the correct dnadb to build the cache
);
$dbentry_adaptor->store($dbentry);
}
}
close FILE;
#print "Feature: "; print ":: Stored $cnt cisRED ExternalFeature motif\n";
#foreach my $field (keys %feature) { print ":: Skipped $skipped cisRED ExternalFeature motif imports\n";
# print $field . ": " . $feature{$field} . " "; print ":: Skipped an additional $skipped_xref DBEntry imports\n";
#}
#print "\n";
}
close FILE;
# ---------------------------------------- # ----------------------------------------
# Search regions # Search regions
# read search_regions.txt from same location as $file # read search_regions.txt from same location as $file
my $search_regions_file = dirname($file) . "/search_regions.txt";
my $skipped_sr = 0; my $search_regions_file = dirname($file) . "/search_regions.txt";
$skipped = 0;
my @search_regions; $cnt = 0;
print "Parsing search regions from $search_regions_file\n";
print ":: Parsing cisRED search regions from $search_regions_file\n";
open (SEARCH_REGIONS, "<$search_regions_file") || die "Can't open $search_regions_file"; open (SEARCH_REGIONS, "<$search_regions_file") || die "Can't open $search_regions_file";
<SEARCH_REGIONS>; # skip header <SEARCH_REGIONS>; # skip header
while (<SEARCH_REGIONS>) { while (<SEARCH_REGIONS>) {
chomp; chomp;
my ($id, $chromosome, $start, $end, $strand, $ensembl_gene_id) = split; my ($id, $chromosome, $start, $end, $strand, $ensembl_gene_id) = split;
my $gene_id = $stable_id_to_internal_id->{gene}->{$ensembl_gene_id}; my $gene_id = $display_name_cache->{gene}->{$ensembl_gene_id};
my $name = "CisRed_Search_$id";
if (!$gene_id) { if (!$gene_id) {
warn("Can't get internal ID for $ensembl_gene_id\n"); warn("Can't get internal ID for $ensembl_gene_id\n");
$skipped_sr++; $skipped++;
next;
}
my $sr_chr_slice = $slice_adaptor->fetch_by_region('chromosome', $chromosome, undef, undef, undef, $old_assembly);
if (!$sr_chr_slice) {
print STDERR "Can't get slice for $chromosome:$start:$end\n";
next;
}
my $sr_seq_region_id = $slice_adaptor->get_seq_region_id($sr_chr_slice);
if (!$sr_seq_region_id) {
print STDERR "Can't get seq_region_id for chromosome $chromosome\n";
next; next;
} }
my $name = "CisRed_Search_$id"; if(! exists $slice_cache{$chromosome}){
if($old_assembly){
$slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome',
$chromosome,
undef,
undef,
undef,
$old_assembly);
}else{
$slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome);
}
if(! defined $slice_cache{$chromosome}){
warn "Can't get slice $chromosome for for search region $name\n";
next;
}
}
my $search_feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
(
-display_label => $name,
-start => $start,
-end => $end,
-strand => $strand,
-feature_type => $self->{'feature_sets'}{'cisRED group motifs'}->feature_type,
-feature_set => $self->{'feature_sets'}{'cisRED group motifs'},
-slice => $slice_cache{$chromosome},
);
# project if necessary # project if necessary
if ($new_assembly) { if ($new_assembly) {
$search_feature = $self->project_feature($search_feature);
#print join("\t", "OLD: ", $start, $end, $strand, $chromosome, $name) . "\n";
if(! defined $search_feature){
my $projected_region = $self->project_feature($start, $end, $strand, $chromosome, $sr_chr_slice, $dummy_analysis, $new_assembly, $slice_adaptor, "CisRed_Search_$id"); $skipped ++;
next;
$start = $projected_region->start(); }
$end = $projected_region->end(); }
$strand = $projected_region->strand();
$extf_adaptor->store($search_feature);
#print join("\t", "NEW: ", $start, $end, $strand, $chromosome, $name) . "\n";
}
my %search_region;
$search_region{NAME} = $name;
$search_region{SEQ_REGION_ID} = $sr_seq_region_id;
$search_region{START} = $start;
$search_region{END} = $end;
$search_region{STRAND} = ($strand =~ /\+/ ? 1 : -1);
$search_region{ENSEMBL_OBJECT_TYPE} = 'Gene';
$search_region{ENSEMBL_OBJECT_ID} = $gene_id;
$search_region{ANALYSIS_ID} = $analysis{cisred_search};
push @search_regions, \%search_region;
} }
close(SEARCH_REGIONS);
# ----------------------------------------
$result{FEATURES} = \@features; close(SEARCH_REGIONS);
$result{FACTORS} = \@factors;
$result{SEARCH_REGIONS} = \@search_regions;
print "Parsed " . scalar(@{$result{FEATURES}}) . " features, " . scalar(@{$result{FACTORS}}) . " factors and " . scalar(@{$result{SEARCH_REGIONS}}) . " search regions\n";
print "Skipped $skipped features and $skipped_sr search regions due to missing stable ID - internal ID mappings\n";
print ":: Stored $cnt cisRED search region ExternalFeatures\n";
print ":: Skipped $skipped cisRED search region ExternalFeatures\n";
print "$coords_changed features had their co-ordinates changed as a result of assembly mapping.\n" if ($new_assembly); #print "$coords_changed features had their co-ordinates changed as a result of assembly mapping.\n" if ($new_assembly);
return \%result; return;
} }
sub new {
my $self = {};
bless $self, "RegulatoryFeatureParser::cisred";
return $self;
}
1; 1;
...@@ -8,7 +8,9 @@ use strict; ...@@ -8,7 +8,9 @@ use strict;
# Similarity hsa-miR-23a miRanda miRNA_target 1 919787 919807 + . 71 transcript id "ENST00000310998" # Similarity hsa-miR-23a miRanda miRNA_target 1 919787 919807 + . 71 transcript id "ENST00000310998"
use RegulatoryFeatureParser::BaseParser; use RegulatoryFeatureParser::BaseParser;
use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::DBEntry;
use Bio::EnsEMBL::Funcgen::ExternalFeature;
use vars qw(@ISA); use vars qw(@ISA);
@ISA = qw(RegulatoryFeatureParser::BaseParser); @ISA = qw(RegulatoryFeatureParser::BaseParser);
...@@ -18,44 +20,94 @@ use vars qw(@ISA); ...@@ -18,44 +20,94 @@ use vars qw(@ISA);
# - arrayref of features # - arrayref of features
# - arrayref of factors # - arrayref of factors
sub parse {
my ($self, $db_adaptor, $file, $old_assembly, $new_assembly) = @_;
my %result;
print "Parsing $file with miranda parser\n"; sub new {
my $caller = shift;
my $class = ref($caller) || $caller;
my $self = $class->SUPER::new(@_);
#Set default feature_type and feature_set config
$self->{'feature_types'} = {
'miRanda' => {
class => 'RNA',
description => 'miRanda microRNA',
},
};
$self->{feature_sets} = {
'miRanda miRNA' => {
feature_type => \$self->{'feature_types'}{'cisRED Search Region'},
analysis =>
{
-logic_name => 'miRanda',
-description => 'miRanda microRNA target prediction (http://cbio.mskcc.org/mirnaviewer/)',
-display_label => 'miRanda',
-displayable => 1,
},
},
};
$self->validate_and_store_feature_types;
$self->set_feature_sets;
my $feature_internal_id = ($self->find_max_id($db_adaptor, "regulatory_feature")) + 1; return $self;
my $highest_factor_id = ($self->find_max_id($db_adaptor, "regulatory_factor")) + 1; }
my $analysis_adaptor = $db_adaptor->get_AnalysisAdaptor();
my $slice_adaptor = $db_adaptor->get_SliceAdaptor();
my @features;
my @factors;
my %factor_ids_by_name; # name -> factor_id
my %feature_objects;
my %anal;
# TODO - regulatory_factor_coding
my $stable_id_to_internal_id = $self->build_stable_id_cache($db_adaptor); sub parse_and_load{
my ($self, $file, $old_assembly, $new_assembly) = @_;
print ":: Parsing miRanda data from:\t$file\n";
my $analysis_adaptor = $db_adaptor->get_AnalysisAdaptor();
my %features_by_name; # name -> feature_type
my %slice_cache;
my $display_name_cache = $self->build_display_name_cache('gene');
# this object is only used for projection # this object is only used for projection
my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'CisRedProjection'); my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'miRandaProjection');
my $skipped = 0;
open (FILE, "<$file") || die "Can't open $file"; open (FILE, "<$file") || die "Can't open $file";
while (<FILE>) { while (<FILE>) {
next if ($_ =~ /^\s*\#/ || $_ =~ /^\s*$/);
next if ($_ =~ /^\s*\#/ || $_ =~ /^\s*$/);
my %feature;
my ($group, $seq, $method, $feature, $chr, $start, $end, $str, $phase, $score, $pvalue, $type, $id_ignore, $id) = split; my ($group, $seq, $method, $feature, $chr, $start, $end, $str, $phase, $score, $pvalue, $type, $id_ignore, $id) = split;
my $strand = ($str =~ /\+/ ? 1 : -1); my $strand = ($str =~ /\+/ ? 1 : -1);
$id =~ s/[\"\']//g; # strip quotes $id =~ s/[\"\']//g; # strip quotes
$id .= ':'.$seq;
if(! exists $slice_cache{$chromosome}){
if($old_assembly){
$slice_cache{$chr} = $self->slice_adaptor->fetch_by_region('chromosome',
$chr,
undef,
undef,
undef,
$old_assembly);
}else{
$slice_cache{$chr} = $self->slice_adaptor->fetch_by_region('chromosome', $chr);
}
if(! defined $slice_cache{$chr}){
warn "Can't get slice $chr for sequence $id\n";
$skipped++;
next;
}
}
# ---------------------------------------- # ----------------------------------------
# Feature name # Feature name
...@@ -63,12 +115,25 @@ sub parse { ...@@ -63,12 +115,25 @@ sub parse {
# For miRNA_target, individual features don't have a unique name, so create # For miRNA_target, individual features don't have a unique name, so create
# a composite one. Also set influence. # a composite one. Also set influence.
$feature{NAME} = $id .":" . $seq;
$feature{INFLUENCE} = "negative"; $feature{INFLUENCE} = "negative";
# ---------------------------------------- # ----------------------------------------
# Factor # Factor
if(! exists $features_by_group{$group_name}){
$features_by_group{$group_name} = $ftype_adaptor->fetch_by_name('crtHsap'.$group_name);
if(! defined $features_by_group{$group_name}){
($features_by_group{$group_name}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new
(
-name => 'crtHsap'.$group_name,
-class => 'Regulatory Motif',
-description => 'cisRED group motif',
))};
}
}
# $seq is the name of a factor - if it's already there, find its ID, otherwise add it # $seq is the name of a factor - if it's already there, find its ID, otherwise add it
my $factor_id = $factor_ids_by_name{$seq}; my $factor_id = $factor_ids_by_name{$seq};
if (!$factor_id) { if (!$factor_id) {
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment