diff --git a/misc-scripts/regulatory_regions/RegulatoryFeatureParser/BaseParser.pm b/misc-scripts/regulatory_regions/RegulatoryFeatureParser/BaseParser.pm index fc84107dcbaea9852cf484d225d7fe9c692dcfee..fc5bc7b464ddd45de606484549f6bba1d52eff52 100644 --- a/misc-scripts/regulatory_regions/RegulatoryFeatureParser/BaseParser.pm +++ b/misc-scripts/regulatory_regions/RegulatoryFeatureParser/BaseParser.pm @@ -4,18 +4,187 @@ use strict; use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::SimpleFeature; +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::FeatureType; +use Bio::EnsEMBL::Analysis; + +# Base functionality for external_feature parsers + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = {}; + + #my $self = {( +# feature_sets => {( +# vista => {('VISTA enhancer set' => undef)}, +# cisred => {( +# 'cisRED search regions' => undef, +# 'cisRED group motifs' => undef, +# )}, +# miranda => {('miRanda miRNA' => undef)}, +# )}, +# )}; + bless $self, $class; + + #validate and set type, analysis and feature_set here + my ($type, $db, $clobber, $archive) = rearrange(['TYPE', 'DB', 'CLOBBER', 'ARCHIVE'], @_); + + throw('You must define a type of external_feature to import') if(! defined $type); + + if (! ($db && ref($db) && + $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){ + throw('You must provide a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); + } + + throw('You can only specify either -clobber or -archive, but not both') if($clobber && $archive); + + + $self->{'db'} = $db; + $self->{type} = $type; + $self->{'clobber'} = $clobber if defined $clobber; + $self->{'archive'} = $archive if defined $archive; + + print ":: Parsing and loading $type ExternalFeatures"; + + return $self; + +} + +sub db{ + my ($self, $db) = @_; + if($db){ + + if(! $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor')){ + throw('You must prove a Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); + } + + $self->{'db'} = $db; + } + + return $self->{'db'}; +} + + +sub set_feature_sets{ + my $self = shift; + + throw('Must provide a set feature_set config hash') if ! defined $self->{'feature_sets'}; + + + my $fset_adaptor = $self->db->get_FeatureSetAdaptor; + my $analysis_adaptor = $self->db->get_AnalysisAdaptor; + + foreach my $fset_name(keys %{$self->{feature_sets}}){ + + #check feature_type is validated? + + my $fset = $fset_adaptor->fetch_by_name($fset_name); + + #what about data sets? + #we don't need data sets for external_sets! + + if(defined $fset){ + + if($self->{'clobber'}){ + #Need to clobber any DBEntries first!!! + + if($self->{'type'} eq 'cisred'){ + my @ext_feat_ids = @{$self->db->dbc->db_handle->selectall_arrayref('select external_feature_id from external_feature where feature_set_id='.$fset->dbID)}; + + if(@ext_feat_ids){ + + my ($core_ext_dbid) = @{$self->db->dbc->db_handle->selectall_arrayref('select external_db_id from external_db where db_name="core"')}; + + if($core_ext_dbid){ + #double table delete? + my $sql = "delete x, ox from object_xref ox, xref x where ox.ensembl_object_type='ExternalFeature' and x.external_db_id=$core_ext_dbid and ox.xref_id=x.xref_id and ensembl_id in(".join(', ', @ext_feat_ids).')'; + print ":: Clobbering xrefs for $fset_name\n"; + $self->db->dbc->do($sql); + } + } + + print ":: Clobbering old features for external feature_set:\t$fset_name\n"; + my $sql = 'delete from external_feature where feature_set_id='.$fset->dbID; + $self->db->dbc->do($sql); + } + + } + elsif($self->{'archive'}){ + my $archive_fset = $fset_adaptor->fetch_by_name($fset_name."_v".$self->{'archive'}); + + if(defined $archive_fset){ + throw("You are trying to create an archive external feature_set which already exists:\t${fset_name}_v".$self->{archive}); + } + + my $sql = "UPDATE feature_set set name='$fset_name}_v".$self->{archive}."' where name='$fset_name'"; + $self->db->dbc->do($sql); + undef $fset; + }else{ + throw("You are trying to create an external feature_set which already exists:\t$fset_name\nMaybe to want to clobber or archive?"); + } + } + + if(!defined $fset){ + #don't need to use RNAFeatureType here as this is the setwide generic feature_type + #or do we have separate tables for external_feature and external_rna_feature? + + #validate analysis first + my $analysis = $analysis_adaptor->fetch_by_logic_name($self->{'feature_sets'}{$fset_name}{'analysis'}{'-logic_name'}); + if(! defined $analysis){ + + print ':: Analysis '.$self->{'feature_sets'}{$fset_name}{'analysis'}{'-logic_name'}. + " not found, storing from config hash\n"; + + $analysis_adaptor->store(Bio::EnsEMBL::Analysis->new + ( + %{$self->{'feature_sets'}{$fset_name}{'analysis'}} + #-logic_name => $self->{'feature_sets'}{$fset_name}{'analysis'}{'logic_name'}, + #-description => $self->{'feature_sets'}{$fset_name}{'analysis'}{'description'}, + #-display_label => $self->{'feature_sets'}{$fset_name}{'analysis'}{'display_label'}, + #-diplayable => $self->{'feature_sets'}{$fset_name}{'analysis'}{'displayable'}, + )); + + $analysis = $analysis_adaptor->fetch_by_logic_name($self->{'feature_sets'}{$fset_name}{'analysis'}); + } + + #replace hash config with object + $self->{'feature_sets'}{$fset_name}{'analysis'} = $analysis; + + $fset = Bio::EnsEMBL::Funcgen::FeatureSet->new( + -name => $fset_name, + -type => 'external', + -analysis => $self->{'feature_sets'}{$fset_name}{'analysis'}, + -feature_type => ${$self->{'feature_sets'}{$fset_name}{'feature_type'}}, + ); + + ($fset) = @{$self->db->get_FeatureSetAdaptor->store($fset)}; + } + + #Now replace config hash with object + $self->{feature_sets}{$fset_name} = $fset; + } + + return; +} -# Base functionality for regualatory feature parsers # -------------------------------------------------------------------------------- # Delete existing regulatory features etc +#This needn't be done in line any more as we can just make the old set not displayable + sub delete_existing { my ($db_adaptor, $type) = @_; + + throw('delete existing is deprecated'); + my $t = lc($type); print "Deleting existing features & related data for type $type\n"; @@ -52,27 +221,61 @@ sub delete_existing { # -------------------------------------------------------------------------------- # Check that the type specified corresponds to an analysis -sub validate_type { - - my ($db_adaptor, $type) = @_; +sub validate_and_store_feature_types{ + my $self = shift; # LBL enhancers have both positive and negative varieties - if (lc($type) eq 'enhancer') { - return (validate_type($db_adaptor, 'enhancer_positive') && validate_type($db_adaptor, 'enhancer_negative')); - } - my $sth = $db_adaptor->dbc->prepare("SELECT analysis_id FROM analysis WHERE LOWER(logic_name)=?"); + #feature type class/naming is logical but not intuitive + #+-----------------+-------------------------+----------+----------------------------------------------------+ + #| feature_type_id | name | class | description | + #+-----------------+-------------------------+----------+----------------------------------------------------+ + #| 398680 | VISTA Enhancer | Enhancer | Enhancer identified by positive VISTA assay | + #| 398681 | VISTA Target - Negative | Region | Enhancer negative region identified by VISTA assay | + #+-----------------+-------------------------+----------+----------------------------------------------------+ - $sth->execute(lc($type)); - if ($sth->fetchrow_array()) { - print "Type $type is valid\n"; - } else { - print "Type $type is not valid - is there an entry for $type in the analysis table?\n"; - return 0; - } - return 1; + #if (lc($type) eq 'VISTA') { + # return (validate_type($db_adaptor, 'VISTA Enhancer') && validate_type($db_adaptor, 'VISTA Target - Negative')); + #} + + #my $sth = $self->db->dbc->prepare("SELECT analysis_id FROM analysis WHERE logic_name=?"); + + + + #remove lc as mysql doesn't care about case + #$sth->execute($type); + #if ($sth->fetchrow_array()) { + # print "Type $type is valid\n"; + #} else { + # print "Type $type is not valid - is there an entry for $type in the analysis table?\n"; + # return 0; + #} + + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; + + foreach my $ftype_name(keys %{$self->{'feature_types'}}){ + + my $ftype = $ftype_adaptor->fetch_by_name($ftype_name); + + if(! defined $ftype){ + print ":: FeatureType '".$ftype_name."' for external feature_set ".$self->{'type'}." not present\n". + ":: Storing using type hash definitions\n"; + + $ftype = Bio::EnsEMBL::Funcgen::FeatureType->new( + -name => $ftype_name, + -class => $self->{'feature_types'}{$ftype_name}{'class'}, + -description => $self->{'feature_types'}{$ftype_name}{'description'}, + ); + ($ftype) = @{$ftype_adaptor->store($ftype)}; + } + + #Replace hash config with object + $self->{'feature_types'}{$ftype_name} = $ftype; + } + + return; } @@ -80,10 +283,11 @@ sub validate_type { # Find the maximum existing ID in a table. sub find_max_id { - my ($self, $db_adaptor, $table) = @_; + my ($self, $table) = @_; - my $row = @{$db_adaptor->dbc->db_handle->selectall_arrayref("SELECT MAX(${table}_id) FROM $table")}[0]; + my $row = @{$self->dbc->db_handle->selectall_arrayref("SELECT MAX(${table}_id) FROM $table")}[0]; my $max_id = @{$row}[0]; + if (!defined $max_id) { print "No existing ${table}_ids, will start from 1\n"; $max_id = 0; @@ -100,36 +304,46 @@ sub find_max_id { # Return hashref keyed on {$type}{$stable_id} # Type is always all lower case -sub build_stable_id_cache { +sub build_display_name_cache { + + my ($self, @types) = @_; - my ($self, $db_adaptor) = @_; + my %display_name_cache; - my %stable_id_to_internal_id; + #validate types here? - foreach my $type ('gene', 'transcript', 'translation') { # Add exon here if required + foreach my $type (@types){#'gene', 'transcript', 'translation') { # Add exon here if required - print "Caching stable ID -> internal ID links for ${type}s\n"; + #No display_xref for translations!! + throw('Cannot build cache for tranlsations, just use stable_id, check your implementatio') if $type eq 'translation'; + + print ":: Caching stable ID -> display_name links for ${type}s\n"; my $count = 0; + my $sql; - my $sth = $db_adaptor->dbc->prepare("SELECT ${type}_id, stable_id FROM ${type}_stable_id"); - $sth->execute(); - my ($internal_id, $stable_id); - $sth->bind_columns(\$internal_id, \$stable_id); + print "dnadb is ".Data::Dumper::Dumper($self->db->dnadb->dbc); + + #could do a fetchall_hashref here? + my $sth= $self->db->dnadb->dbc->prepare("SELECT t.stable_id, x.display_label FROM s.${type}_stable_id, t.${type} left join x.xref on t.display_xref_id=x.xref_id and s.${type}_id=t.${type}_id"); - while ($sth->fetch) { + - $stable_id_to_internal_id{$type}{$stable_id} = $internal_id; - $count++; + $sth->execute(); + my ($stable_id, $display_name); + $sth->bind_columns(\$stable_id, \$display_name); + while ($sth->fetch) { + #removed type + $display_name_cache{$stable_id} = $display_name; + $count++ if $display_name;#will this be ! NULL? } - print "Got $count $type stable ID -> internal ID mappings\n"; + print ":: Got $count $type stable ID -> display_name mappings\n"; } - return \%stable_id_to_internal_id; - + return %display_name_cache; } # -------------------------------------------------------------------------------- @@ -137,9 +351,18 @@ sub build_stable_id_cache { sub upload_features_and_factors { - my ($self, $db_adaptor, $objects) = @_; + my ($self, $objects) = @_; + + + throw("upload_features_and_factors is deprecated"); + + my $dbc = $self->db->dbc; + - my $dbc = $db_adaptor->dbc; + + # we need to convert this to use the external_feature API + #this means we need to build external_features in the other modules and test for each RNA_feature_type/motif_feature_type? + #keep as feature_types for now, maybe next release? my $feature_sth = $dbc->prepare("INSERT INTO regulatory_feature (regulatory_feature_id, name, seq_region_id, seq_region_start, seq_region_end, seq_region_strand, analysis_id, regulatory_factor_id) VALUES(?,?,?,?,?,?,?,?)"); my $factor_sth = $dbc->prepare("INSERT INTO regulatory_factor (regulatory_factor_id, name, type) VALUES(?,?,?)"); @@ -211,65 +434,88 @@ sub upload_features_and_factors { # -------------------------------------------------------------------------------- # Project a feature from one slice to another sub project_feature { - - my ($self, $start, $end, $strand, $chr, $slice, $analysis, $new_assembly, $slice_adaptor, $label) = @_; + my ($self, $feat, $new_assembly) = @_; # just use a SimpleFeature for convenience - my $feat = Bio::EnsEMBL::SimpleFeature->new - (-start => $start, - -end => $end, - -strand => $strand, - -slice => $slice, - -analysis => $analysis, - -display_label => $label, - -score => 0); + #my $feat = Bio::EnsEMBL::SimpleFeature->new + # (-start => $start, + # -end => $end, + # -strand => $strand, + # -slice => $slice, + # -analysis => $analysis, + # -display_label => $label, + # -score => 0); # project feature to new assembly my $feat_slice = $feat->feature_Slice; my @segments = @{ $feat_slice->project('chromosome', $new_assembly) }; - next unless (@segments); - next if (scalar(@segments) > 1); + #next what? + #next unless (@segments); + #next if (scalar(@segments) > 1); + + if(! @segments){ + print "Failed to project feature:\t".$feat->display_label."\n"; + } + elsif(scalar(@segments) >1){ + print "Failed to project feature to distinct location:\t".$feat->display_label."\n"; + return; + } my $proj_slice = $segments[0]->to_Slice; - next unless ($feat_slice->length == $proj_slice->length); + + if($feat_slice->length != $proj_slice->length){ + print "Failed to project feature to comparable length region:\t".$feat->display_label."\n"; + return; + } # everything looks fine, so adjust the coords of the feature $feat->start($proj_slice->start); $feat->end($proj_slice->end); $feat->strand($proj_slice->strand); - my $slice_new_asm = $slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, undef, $new_assembly); + my $slice_new_asm = $self->slice_adaptor->fetch_by_region('chromosome', $proj_slice->seq_region_name, undef, undef, undef, $new_assembly); $feat->slice($slice_new_asm); return $feat; } +sub slice_adaptor{ + my $self = shift; + + if(! defined $self->{'slice_adaptor'}){ + $self->{'slice_adaptor'} = $self->db->get_SliceAdaptor; + } + + return $self->{'slice_adaptor'}; +} + + # -------------------------------------------------------------------------------- -sub get_blank_factor_id () { +#sub get_blank_factor_id () { - my ($self, $db_adaptor) = @_; +# my ($self, $db_adaptor) = @_; - my $sth = $db_adaptor->dbc->prepare("SELECT regulatory_factor_id FROM regulatory_factor WHERE name=''"); - $sth->execute(); +# my $sth = $db_adaptor->dbc->prepare("SELECT regulatory_factor_id FROM regulatory_factor WHERE name=''"); +# $sth->execute(); - my ($factor_id) = $sth->fetchrow_array(); +# my ($factor_id) = $sth->fetchrow_array(); - if ($factor_id) { - print "Found existing blank factor, id = $factor_id\n"; - } else { - $db_adaptor->dbc->do("INSERT INTO regulatory_factor (name) VALUES ('')"); - $sth->execute(); - ($factor_id) = $sth->fetchrow_array(); - print "Created new blank factor, id = $factor_id\n"; - } +# if ($factor_id) { +# print "Found existing blank factor, id = $factor_id\n"; +# } else { +# $db_adaptor->dbc->do("INSERT INTO regulatory_factor (name) VALUES ('')"); +# $sth->execute(); +# ($factor_id) = $sth->fetchrow_array(); +# print "Created new blank factor, id = $factor_id\n"; +# } - return $factor_id; +# return $factor_id; -} +#} 1; diff --git a/misc-scripts/regulatory_regions/RegulatoryFeatureParser/cisred.pm b/misc-scripts/regulatory_regions/RegulatoryFeatureParser/cisred.pm index a4b52f7ce8dc1d394ca83fb7b09d1f1f4e5669cd..70ba69a7c9118a2273e3a8e92d6845ac77e75427 100644 --- a/misc-scripts/regulatory_regions/RegulatoryFeatureParser/cisred.pm +++ b/misc-scripts/regulatory_regions/RegulatoryFeatureParser/cisred.pm @@ -10,13 +10,21 @@ use File::Basename; # # http://www.cisred.org/content/databases_methods/human_2/data_files/search_regions.txt + +#No longer valid urls, now use the following for ensembl formats for all species: +#http://www.bcgsc.ca/downloads/cisred/temp/cisRED4Ensembl/ +#naminf may not be obvious, may have to cross reference with above previous urls to get build info + # Format of motifs.txt (note group_name often blank) -# name chromosome start end strand group_name -# craHsap1 X 138337029 138337034 1 -# craHsap2 X 138338145 138338150 1 -# craHsap3 X 138338363 138338368 1 -# craHsap4 X 138338388 138338395 1 +#name chromosome start end strand group_name ensembl_gene +#craHsap1 1 168129978 168129997 -1 1 ENSG00000000457 +#craHsap2 1 168129772 168129781 -1 2 ENSG00000000457 +#craHsap3 1 168129745 168129756 -1 3 ENSG00000000457 +#craHsap4 1 168129746 168129753 -1 4 ENSG00000000457 +#craHsap5 1 168129745 168129752 -1 5 ENSG00000000457 +#craHsap6 1 168129741 168129757 -1 6 ENSG00000000457 + # Format of search_regions.txt # name chromosome start end strand ensembl_gene_id @@ -26,284 +34,301 @@ use File::Basename; # 19 1 23602820 23605631 -1 ENSG00000007968 -use Bio::EnsEMBL::DBSQL::DBAdaptor; +use RegulatoryFeatureParser::BaseParser; +use Bio::EnsEMBL::DBEntry; +use Bio::EnsEMBL::Funcgen::ExternalFeature; use vars qw(@ISA); @ISA = qw(RegulatoryFeatureParser::BaseParser); -# Parse file and return hashref containing: -# -# - arrayref of features -# - arrayref of factors -sub parse { - my ($self, $db_adaptor, $file, $old_assembly, $new_assembly) = @_; - my %result; - my $analysis_adaptor = $db_adaptor->get_AnalysisAdaptor(); - my $slice_adaptor = $db_adaptor->get_SliceAdaptor(); - - my $stable_id_to_internal_id = $self->build_stable_id_cache($db_adaptor); - - print "Parsing $file with cisred parser\n"; - - # ---------------------------------------- - # We need a "blank" factor for those features which aren't assigned factors - # Done this way to maintain referential integrity - - my $blank_factor_id = $self->get_blank_factor_id($db_adaptor); - - # ---------------------------------------- - - my $feature_internal_id = ($self->find_max_id($db_adaptor, "regulatory_feature")) + 1; - my $highest_factor_id = ($self->find_max_id($db_adaptor, "regulatory_factor")) + 1; - - my @features; - my @factors; - my %factor_ids_by_name; # name -> factor_id - my %feature_objects; +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + #Set default feature_type and feature_set config + $self->{'feature_types'} = { + 'cisRED Search Region' => { + class => 'Region', + description => 'cisRED search region', + }, + 'cisRED Motif' => { + class => 'Regulatory Motif', + description => 'cisRED group motif', + }, + }; + + $self->{feature_sets} = { + 'cisRED search regions' => { + feature_type => \$self->{'feature_types'}{'cisRED Search Region'}, + analysis => + { + -logic_name => 'cisRED', + -description => 'cisRED motif search (www.cisred.org)', + -display_label => 'cisRED', + -displayable => 1, + }, + }, + 'cisRED group motifs' => { + feature_type => \$self->{'feature_types'}{'cisRED Motif'}, + analysis => + { + -logic_name => 'cisRED', + -description => 'cisRED motif search (www.cisred.org)', + -display_label => 'cisRED', + -displayable => 1, + }, + }, + }; + + + + $self->validate_and_store_feature_types; + $self->set_feature_sets; - # ---------------------------------------- - # Analysis - need one for each type of feature - my %analysis; + return $self; +} - foreach my $anal ("cisRed", "cisred_search") { # TODO - add other types as necessary - my $analysis_obj = $analysis_adaptor->fetch_by_logic_name($anal); - die "Can't get analysis for $anal, skipping" if (!$analysis_obj); - $analysis{$anal} = $analysis_obj->dbID(); - print "Analysis ID for $anal is " . $analysis{$anal} . "\n"; - } +# Parse file and return hashref containing: +# +# - arrayref of features +# - arrayref of factors +sub parse_and_load { + my ($self, $file, $old_assembly, $new_assembly) = @_; + print ":: Parsing $file with cisRED parser\n"; + + my $analysis_adaptor = $self->db->get_AnalysisAdaptor(); + my %features_by_group; # name -> factor_id + my %slice_cache; + my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor; + my $dbentry_adaptor = $self->db->get_DBEntryAdaptor; + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; + my $display_name_cache = $self->build_display_name_cache('gene'); # this object is only used for projection my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'CisRedProjection'); # ---------------------------------------- - # Parse motifs.txt file - - print "Parsing features from $file\n"; + # We need a "blank" factor for those features which aren't assigned factors + # Done this way to maintain referential integrity + #my $blank_factor_id = $self->get_blank_factor_id($db_adaptor); + # Parse motifs.txt file + print ":: Parsing cisRED motifs from $file\n"; my $skipped = 0; - - my $coords_changed = 0; + my $skipped_xref = 0; + #my $coords_changed = 0; + my $cnt = 0; open (FILE, "<$file") || die "Can't open $file"; <FILE>; # skip header - while (<FILE>) { + while (<FILE>) { next if ($_ =~ /^\s*\#/ || $_ =~ /^\s*$/); - my %feature; # name chromosome start end strand group_name ensembl_gene_id my ($motif_name, $chromosome, $start, $end, $strand, $group_name, $gene_id) = split (/\t/); - ($gene_id) = $gene_id =~ /(ENS.*G\d{11})/; - - # ---------------------------------------- - # Feature name & analysis - - $feature{NAME} = $motif_name; - $feature{INFLUENCE} = "unknown"; # TODO - what does cisRed store? - $feature{ANALYSIS_ID} = $analysis{cisRed}; - - # ---------------------------------------- - # Factor - - # If $group_id is present in %group_sizes we want to create or reuse a factor. - # If not, this feature is not associated with any factor. - # If the factor is to be created, its name is crtHsapXX where XX is group_id. - # TODO - other species prefixes - - $feature{FACTOR_ID} = $blank_factor_id; - if ($group_name && $group_name ne '' && $group_name !~ /\s/) { - my $factor_id = $factor_ids_by_name{$group_name}; - - if (!$factor_id) { # create one - my %factor; - $factor_id = $highest_factor_id + 1; - $factor{INTERNAL_ID} = $factor_id; - $factor{NAME} = $group_name; - $factor{TYPE} = 'NULL'; - push @factors, \%factor; - $factor_ids_by_name{$factor{NAME}} = $factor_id; - $highest_factor_id = $factor_id; - #print join(" ", ("Factor: ", $factor{ID}, $factor{NAME}, $factor{TYPE})) . "\n"; - } - $feature{FACTOR_ID} = $factor_id; - } - - # ---------------------------------------- - # Seq_region ID and co-ordinates, projected if necessary - - my $chr_slice = $slice_adaptor->fetch_by_region('chromosome', $chromosome, undef, undef, undef, $old_assembly); - - if (!$chr_slice) { - print STDERR "Can't get slice for $chromosome:$start:$end\n"; - next; - } - - my $seq_region_id = $slice_adaptor->get_seq_region_id($chr_slice); - - if (!$seq_region_id) { - print STDERR "Can't get seq_region_id for chromosome $chromosome\n"; - next; - } - - $feature{SEQ_REGION_ID} = $seq_region_id; - + #($gene_id) = $gene_id =~ /(ENS.*G\d{11})/; + + if(! exists $slice_cache{$chromosome}){ + + if($old_assembly){ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', + $chromosome, + undef, + undef, + undef, + $old_assembly); + }else{ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); + } + + if(! defined $slice_cache{$chromosome}){ + warn "Can't get slice $chromosome for motif $motif_name\n"; + $skipped++ + next; + } + } + + #get feature_type first + + #we are not maintaining this link in the DB! + #Do we need another xref for this or a different table? + + + if ($group_name && $group_name ne '' && $group_name !~ /\s/) { + + if(! exists $features_by_group{$group_name}){ + $features_by_group{$group_name} = $ftype_adaptor->fetch_by_name('crtHsap'.$group_name); + + if(! defined $features_by_group{$group_name}){ + ($features_by_group{$group_name}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new + ( + -name => 'crtHsap'.$group_name, + -class => 'Regulatory Motif', + -description => 'cisRED group motif', + ))}; + } + } + }else{ + throw('Found cisRED feature $motif_name with no group_name, unable to defined feature_type'); + } + + my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new( + -display_label => $motif_name, + -start => $start, + -end => $end, + -strand => $strand, + -feature_type => $features_by_group{$group_name} + -feature_set => $self->{'feature_sets'}{'cisRED group motifs'}, + -slice => $slice_cache{$chromosome}, + ); + + + # project if necessary if ($new_assembly) { + $feature = $self->project_feature($feature, $new_assembly); - #print join("\t", "OLD: ", $start, $end, $strand, $chromosome, $motif_name) . "\n"; - - my $projected_feature = $self->project_feature($start, $end, $strand, $chromosome, $chr_slice, $dummy_analysis, $new_assembly, $slice_adaptor, $motif_name); - - $coords_changed++ if ($projected_feature->start() != $start || $projected_feature->end() != $end); + if(! defined $feature){ + $skipped ++; + next; + } - $start = $projected_feature->start(); - $end = $projected_feature->end(); - $strand = $projected_feature->strand(); - $feature{SEQ_REGION_ID} = $slice_adaptor->get_seq_region_id($projected_feature->feature_Slice); - - #print join("\t", "NEW: ", $start, $end, $strand, $chromosome, $motif_name) . "\n"; - - } - - $feature{START} = $start; - $feature{END} = $end; - $feature{STRAND} = $strand; - - - # ---------------------------------------- - # Ensembl object - always a gene in cisRed - - my $ensembl_id = $stable_id_to_internal_id->{gene}->{$gene_id}; - - if (!$ensembl_id) { - print STDERR "Can't get ensembl internal ID for $gene_id, skipping\n"; - print join("-", $motif_name, $chromosome, $start, $end, $strand, $group_name, $gene_id, "\n"); - $skipped++; - next; + #$coords_changed++ if ($projected_feature->start() != $start || $projected_feature->end() != $end); } - $feature{ENSEMBL_TYPE} = "Gene"; - $feature{ENSEMBL_ID} = $ensembl_id; - - # ---------------------------------------- - # Feature internal ID - - $feature{INTERNAL_ID} = $feature_internal_id++; - - # ---------------------------------------- - # Evidence - - $feature{EVIDENCE} = ""; - - # ---------------------------------------- - # Add to object to be returned - - push @features, \%feature; + ($feature) = @{$extf_adaptor->store($feature)}; + $cnt++; + + #Build Xref here + if($gene_id){ + + #need to take xref core dbname as a parameter + #defaulting to current db at present + + #should this not have some 'gene' e.g. core_gene? + + if(! exists $display_name_cache->{$gene_id}){ + warn "Cannot get ensembl gene id $gene_id for motif $motif_name\n"; + $skipped_xref++; + next; + } + + my $dbentry = Bio::EnsEMBL::DBEntry->new( + -dbname => 'core', + -release => $self->db->dnadb->dbname, + -status => 'KNOWNXREF', + -display_label_linkable => 1, + -db_display_name => $self->db->dnadb->dbname, + -type => 'MISC', + -dbprimary_acc => $gene_id, + -display_name => $display_name_cache->{$gene_id}, + -info_type => 'MISC' + #could have version here if we use the correct dnadb to build the cache + ); + $dbentry_adaptor->store($dbentry); + } + } + + close FILE; - #print "Feature: "; - #foreach my $field (keys %feature) { - # print $field . ": " . $feature{$field} . " "; - #} - #print "\n"; + print ":: Stored $cnt cisRED ExternalFeature motif\n"; + print ":: Skipped $skipped cisRED ExternalFeature motif imports\n"; + print ":: Skipped an additional $skipped_xref DBEntry imports\n"; - } - close FILE; # ---------------------------------------- # Search regions # read search_regions.txt from same location as $file - my $search_regions_file = dirname($file) . "/search_regions.txt"; - my $skipped_sr = 0; - - my @search_regions; - print "Parsing search regions from $search_regions_file\n"; + my $search_regions_file = dirname($file) . "/search_regions.txt"; + $skipped = 0; + $cnt = 0; + + print ":: Parsing cisRED search regions from $search_regions_file\n"; open (SEARCH_REGIONS, "<$search_regions_file") || die "Can't open $search_regions_file"; <SEARCH_REGIONS>; # skip header while (<SEARCH_REGIONS>) { chomp; my ($id, $chromosome, $start, $end, $strand, $ensembl_gene_id) = split; - my $gene_id = $stable_id_to_internal_id->{gene}->{$ensembl_gene_id}; + my $gene_id = $display_name_cache->{gene}->{$ensembl_gene_id}; + my $name = "CisRed_Search_$id"; + if (!$gene_id) { warn("Can't get internal ID for $ensembl_gene_id\n"); - $skipped_sr++; - next; - } - my $sr_chr_slice = $slice_adaptor->fetch_by_region('chromosome', $chromosome, undef, undef, undef, $old_assembly); - if (!$sr_chr_slice) { - print STDERR "Can't get slice for $chromosome:$start:$end\n"; - next; - } - my $sr_seq_region_id = $slice_adaptor->get_seq_region_id($sr_chr_slice); - if (!$sr_seq_region_id) { - print STDERR "Can't get seq_region_id for chromosome $chromosome\n"; + $skipped++; next; } - my $name = "CisRed_Search_$id"; + if(! exists $slice_cache{$chromosome}){ + + if($old_assembly){ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', + $chromosome, + undef, + undef, + undef, + $old_assembly); + }else{ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); + } + + if(! defined $slice_cache{$chromosome}){ + warn "Can't get slice $chromosome for for search region $name\n"; + next; + } + } + + my $search_feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new + ( + -display_label => $name, + -start => $start, + -end => $end, + -strand => $strand, + -feature_type => $self->{'feature_sets'}{'cisRED group motifs'}->feature_type, + -feature_set => $self->{'feature_sets'}{'cisRED group motifs'}, + -slice => $slice_cache{$chromosome}, + ); + + # project if necessary if ($new_assembly) { - - #print join("\t", "OLD: ", $start, $end, $strand, $chromosome, $name) . "\n"; - - my $projected_region = $self->project_feature($start, $end, $strand, $chromosome, $sr_chr_slice, $dummy_analysis, $new_assembly, $slice_adaptor, "CisRed_Search_$id"); - - $start = $projected_region->start(); - $end = $projected_region->end(); - $strand = $projected_region->strand(); - - #print join("\t", "NEW: ", $start, $end, $strand, $chromosome, $name) . "\n"; - - } - - my %search_region; - $search_region{NAME} = $name; - $search_region{SEQ_REGION_ID} = $sr_seq_region_id; - $search_region{START} = $start; - $search_region{END} = $end; - $search_region{STRAND} = ($strand =~ /\+/ ? 1 : -1); - $search_region{ENSEMBL_OBJECT_TYPE} = 'Gene'; - $search_region{ENSEMBL_OBJECT_ID} = $gene_id; - $search_region{ANALYSIS_ID} = $analysis{cisred_search}; - push @search_regions, \%search_region; - + $search_feature = $self->project_feature($search_feature); + + if(! defined $search_feature){ + $skipped ++; + next; + } + } + + $extf_adaptor->store($search_feature); } - close(SEARCH_REGIONS); - - - # ---------------------------------------- - $result{FEATURES} = \@features; - $result{FACTORS} = \@factors; - $result{SEARCH_REGIONS} = \@search_regions; - - print "Parsed " . scalar(@{$result{FEATURES}}) . " features, " . scalar(@{$result{FACTORS}}) . " factors and " . scalar(@{$result{SEARCH_REGIONS}}) . " search regions\n"; + close(SEARCH_REGIONS); - print "Skipped $skipped features and $skipped_sr search regions due to missing stable ID - internal ID mappings\n"; + + print ":: Stored $cnt cisRED search region ExternalFeatures\n"; + print ":: Skipped $skipped cisRED search region ExternalFeatures\n"; - print "$coords_changed features had their co-ordinates changed as a result of assembly mapping.\n" if ($new_assembly); + #print "$coords_changed features had their co-ordinates changed as a result of assembly mapping.\n" if ($new_assembly); - return \%result; + return; } -sub new { - - my $self = {}; - bless $self, "RegulatoryFeatureParser::cisred"; - return $self; -} 1; diff --git a/misc-scripts/regulatory_regions/RegulatoryFeatureParser/miranda.pm b/misc-scripts/regulatory_regions/RegulatoryFeatureParser/miranda.pm index 21c0044aacb41abfdf71865fc8d3b9c609be45b8..046697a5ca95bf12d344f7de3a73b2781f716d2a 100644 --- a/misc-scripts/regulatory_regions/RegulatoryFeatureParser/miranda.pm +++ b/misc-scripts/regulatory_regions/RegulatoryFeatureParser/miranda.pm @@ -8,7 +8,9 @@ use strict; # Similarity hsa-miR-23a miRanda miRNA_target 1 919787 919807 + . 71 transcript id "ENST00000310998" use RegulatoryFeatureParser::BaseParser; -use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::DBEntry; +use Bio::EnsEMBL::Funcgen::ExternalFeature; + use vars qw(@ISA); @ISA = qw(RegulatoryFeatureParser::BaseParser); @@ -18,44 +20,94 @@ use vars qw(@ISA); # - arrayref of features # - arrayref of factors -sub parse { - my ($self, $db_adaptor, $file, $old_assembly, $new_assembly) = @_; - my %result; - print "Parsing $file with miranda parser\n"; +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + #Set default feature_type and feature_set config + $self->{'feature_types'} = { + 'miRanda' => { + class => 'RNA', + description => 'miRanda microRNA', + }, + }; + $self->{feature_sets} = { + 'miRanda miRNA' => { + feature_type => \$self->{'feature_types'}{'cisRED Search Region'}, + analysis => + { + -logic_name => 'miRanda', + -description => 'miRanda microRNA target prediction (http://cbio.mskcc.org/mirnaviewer/)', + -display_label => 'miRanda', + -displayable => 1, + }, + }, + }; + + + + $self->validate_and_store_feature_types; + $self->set_feature_sets; - my $feature_internal_id = ($self->find_max_id($db_adaptor, "regulatory_feature")) + 1; - my $highest_factor_id = ($self->find_max_id($db_adaptor, "regulatory_factor")) + 1; + return $self; +} - my $analysis_adaptor = $db_adaptor->get_AnalysisAdaptor(); - my $slice_adaptor = $db_adaptor->get_SliceAdaptor(); - my @features; - my @factors; - my %factor_ids_by_name; # name -> factor_id - my %feature_objects; - my %anal; - # TODO - regulatory_factor_coding - my $stable_id_to_internal_id = $self->build_stable_id_cache($db_adaptor); +sub parse_and_load{ + my ($self, $file, $old_assembly, $new_assembly) = @_; + + print ":: Parsing miRanda data from:\t$file\n"; + + my $analysis_adaptor = $db_adaptor->get_AnalysisAdaptor(); + my %features_by_name; # name -> feature_type + my %slice_cache; + my $display_name_cache = $self->build_display_name_cache('gene'); # this object is only used for projection - my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'CisRedProjection'); + my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'miRandaProjection'); + my $skipped = 0; + open (FILE, "<$file") || die "Can't open $file"; while (<FILE>) { - - next if ($_ =~ /^\s*\#/ || $_ =~ /^\s*$/); - - my %feature; + next if ($_ =~ /^\s*\#/ || $_ =~ /^\s*$/); my ($group, $seq, $method, $feature, $chr, $start, $end, $str, $phase, $score, $pvalue, $type, $id_ignore, $id) = split; my $strand = ($str =~ /\+/ ? 1 : -1); $id =~ s/[\"\']//g; # strip quotes + $id .= ':'.$seq; + + if(! exists $slice_cache{$chromosome}){ + + if($old_assembly){ + $slice_cache{$chr} = $self->slice_adaptor->fetch_by_region('chromosome', + $chr, + undef, + undef, + undef, + $old_assembly); + }else{ + $slice_cache{$chr} = $self->slice_adaptor->fetch_by_region('chromosome', $chr); + } + + if(! defined $slice_cache{$chr}){ + warn "Can't get slice $chr for sequence $id\n"; + $skipped++; + next; + } + } + + + + # ---------------------------------------- # Feature name @@ -63,12 +115,25 @@ sub parse { # For miRNA_target, individual features don't have a unique name, so create # a composite one. Also set influence. - $feature{NAME} = $id .":" . $seq; + $feature{INFLUENCE} = "negative"; # ---------------------------------------- # Factor + if(! exists $features_by_group{$group_name}){ + $features_by_group{$group_name} = $ftype_adaptor->fetch_by_name('crtHsap'.$group_name); + + if(! defined $features_by_group{$group_name}){ + ($features_by_group{$group_name}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new + ( + -name => 'crtHsap'.$group_name, + -class => 'Regulatory Motif', + -description => 'cisRED group motif', + ))}; + } + } + # $seq is the name of a factor - if it's already there, find its ID, otherwise add it my $factor_id = $factor_ids_by_name{$seq}; if (!$factor_id) {