From 97ac054637d6e2c5e081cafb876656c3b5f08a3d Mon Sep 17 00:00:00 2001
From: James Allen <jallen@ebi.ac.uk>
Date: Thu, 1 Nov 2012 16:34:16 +0000
Subject: [PATCH] 'FlybaseParser.pm' is the current parser; removing this one
 to prevent confusion...

---
 .../XrefParser/Flybase_dmel_GFFv3_Parser.pm   | 1171 -----------------
 1 file changed, 1171 deletions(-)
 delete mode 100644 misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm

diff --git a/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm b/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm
deleted file mode 100644
index 5d720ce54f..0000000000
--- a/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm
+++ /dev/null
@@ -1,1171 +0,0 @@
-# Parse UniProt (SwissProt & SPTrEMBL) files to create xrefs.
-#
-# Files actually contain both types of xref, distinguished by ID line;
-#
-# This module will read in the fly gff text file and make xrefs from the information in the file.
-# First of all, it read knows what all the gene, transcript and translation types are, found in column 3 of the gff file:
-# Gene = gene
-# Transcript = mRNA ncRNA snRNA tRNA rRNA pseudogene snoRNA miRNA
-# Translation = protein (could include CDS here but haven't ?) 
-# 
-# ID=FBgn => flybase_gene_id
-# ID=FBtr => flybase_transcript_id
-# ID=FBpp => flybase_polypeptide_id
-# Name=CG0123 => FlyBaseName_gene
-# Name=CG0123-RA => FlyBaseName_transcript
-# Name=CG0123-PA => FlyBaseName_translations
-# Dbxref=FlyBase:FBan => flybase_annotation_id
-# Dbxref=FlyBase_Annotation_IDs:CG0123 => gadfly_gene_cgid
-# Dbxref=FlyBase_Annotation_IDs:CG0123-RA => gadfly_transcript_cgid
-# Dbxref=FlyBase_Annotation_IDs:CG0123-PA => gadfly_translation_id
-# Alias= => flybase_synonym
-#
-# For each line in the gff file for the above list of genes, transcript and translations, the following happens:
-# The unique_id is read in from ID= (FBgn, FBtr, FBpp). This is the direct xref for all xrefs of this entry.
-# An xref is made for the entry, using the ID as the xref's accession. Synonyms from Alias= are added to this xref.
-# The Name (Name=) is read in and added as an xref. Synonyms from Alias= are added to this xref.
-# All entries from Dbxref= are added in as xrefs for the entry; they have no synonyms.
-  
-#2L gene [...]  ID=CG11023;Dbxref=FlyBase:FBan0011023,FlyBase:FBgn0031208;gbunit=AE003590;synonym=CG11023
-#2L mRNA [...]  ID=CG11023-RA;Dbxref=FlyBase:FBtr008,FlyBase:FBgn003;dbxref_2nd=Gadfly:CG11023-RA;synonym=CG23-RA
-#3R     FlyBase gene    8084471 8128509 .       +       .
-#ID=FBgn0003651;Name=svp;Alias=FBgn0011337,FBgn0011492,FBgn0011510,FBgn0038010,FBgn0063263;Ontology_term=SO:0000010,SO:0000087,GO:0004872,GO:0007270,GO:0042331,GO:0005515,GO:0007419,GO:0007503,GO:0045449,GO:0004879,GO:0003700,GO:0005634,GO:0007465,GO:0007462,GO:0007464,GO:0007510,GO:0005737,GO:0007507,GO:0007417,GO:0001700,GO:0006357,GO:0007165,GO:0043565,GO:0003707,GO:0008270,GO:0048749,GO:0001752;Dbxref=FlyBase:FBan0011502,FlyBase_Annotation_IDs:CG11502,INTERPRO:IPR013088,GB:AC007724,GB:AE003695,GB_protein:AAF54773,GB_protein:AAN13541,GB_protein:AAF54774,GB:AI108883,GB:AI402121,GB:AY075272,GB_protein:AAL68139,GB:AY119490,GB_protein:AAM50144,GB:AY129452,GB_protein:AAM76194,GB:BG633933,GB:BI167911,GB:CZ468719,GB:CZ472606,GB:CZ475640,GB:CZ475641,GB:CZ477001,GB:CZ482253,GB:CZ485541,GB:CZ485543,GB:G00472,GB:M28863,GB_protein:AAA62770,GB:M28864,GB_protein:AAA03014,UniProt/Swiss-Prot:P16375,UniProt/Swiss-Prot:P16376,UniProt/TrEMBL:Q8MRP3,INTERPRO:IPR000536,INTERPRO:IPR001628,INTERPRO:IPR001723,INTERPRO:IPR003068,INTERPRO:IPR008946,INTERPRO:IPR013629,dedb:9161,flygrid:66603,hybrigenics:521960,if:/newgene/sevenup.htm,orthologs:ensAG:ENSANGG00000002454,orthologs:ensAM:ENSAPMG00000000116,orthologs:ensCF:ENSCAFG00000008076,orthologs:ensDM:CG12744,orthologs:ensDR:ENSDARG00000017168,orthologs:ensFR:SINFRUG00000127451,orthologs:ensGG:ENSGALG00000007000,orthologs:ensHS:ENSG00000185551,orthologs:ensMM:ENSMUSG00000030551,orthologs:ensPT:ENSPTRG00000007484,orthologs:ensRN:ENSRNOG00000010308,orthologs:ensTN:GSTENG00006911001,orthologs:modCB:WBGene00030075;cyto_range=87B4-87B5;gbunit=AE014297;
-
-
-package XrefParser::Flybase_dmel_GFFv3_Parser;
-
-use strict;
-use warnings;
-use Carp;
-use POSIX qw(strftime);
-use File::Basename;
-use Bio::EnsEMBL::Utils::Exception;
-use URI::Escape;
-
-use base qw( XrefParser::BaseParser );
-
-my %cache_source =();
-
-my $verbose;
-
-sub new {
-  my $proto = shift;
-  my $self = $proto->SUPER::new(@_);
-
-  $self->external_source_db_name('flybase_gff');
-
-  #  my @gff_obj =qw( CDS exon gene mRNA ncRNA pseudogene rRNA snRNA snoRNA tRNA );
-  # this array may need to change between releases so check that it's updated
-  my @gff_obj =qw( gene mRNA ncRNA snRNA tRNA rRNA pseudogene snoRNA miRNA);
-  $self->gff_object_types(\@gff_obj);
-
-  #
-  # hard-coded field separators out of gff
-  #
-
-  $self->gff_name("Name=");
-  $self->gff_ontology("Ontology_term=");
-  $self->gff_synonym("Alias=");
-  $self->gff_dbxref("Dbxref=");
-
-  #
-  # hard-coded source-names for different objects out of ./sql/populate_metadata.sql
-  #
-  # For Alias
-  $self->source_name_synonym('flybase_synonym'); # source for any Alias
-  # For Name
-  $self->source_name_name_prefix('FlyBaseName_'); # source for any Name
-  # For Dbxref
-  $self->source_name_fbgn('flybase_gene_id');         # source-name for ID=FBgn
-  $self->source_name_fbtr('flybase_transcript_id');   # source-name for ID=FBtr
-  $self->source_name_fbpp('flybase_polypeptide_id');  # source-name for ID=FBpp
-  $self->source_name_fban('flybase_annotation_id');   # source-name for ID=FBan
-  $self->source_name_gadfly_gene('gadfly_gene_cgid');                # For Dbxref=FlyBase_Annotation_IDs
-  $self->source_name_gadfly_transcript('gadfly_transcript_cgid');    # For Dbxref=FlyBase_Annotation_IDs
-  $self->source_name_gadfly_translation('gadfly_translation_cgid');  # For Dbxref=FlyBase_Annotation_IDs
-  $self->source_name_affymetrix('AFFY_DrosGenome1');    # For Dbxref=Affymetrix
-  $self->source_name_dgrc1('DGRC-1');   # For Dbxref=DGRC-1 
-  $self->source_name_dgrc2('DGRC-2');   # For Dbxref=DGRC-2
-  $self->source_name_drsc('DRSC');      # For Dbxref=DRSC
-  $self->source_name_epd('EPD');        # For Dbxref=EPD
-  $self->source_name_flyreg('FlyReg');  # For Dbxref=FlyReg
-  $self->source_name_gb('EMBL');        # For Dbxref=GB
-  $self->source_name_gbprotein('protein_id');   # For Dbxref=GB_protein
-  $self->source_name_gcr('GPCR');       # For Dbxref=GCR
-  $self->source_name_gi('GI');          # For Dbxref=GI
-  $self->source_name_go('GO');          # For Dbxref=GO
-  $self->source_name_genomeRNAi('GenomeRNAi');  # For Dbxref=GenomeRNAi
-  $self->source_name_interpro('Interpro');      # For Dbxref=INTERPRO
-  $self->source_name_merops('MEROPS');  # For Dbxref=MEROPS
-  $self->source_name_miRBase('miRBase');        # For Dbxref=miRBase
-  $self->source_name_mitodrome('MitoDrome');    # For Dbxref=MitoDrome
-  $self->source_name_nrl3d('PDB');      # For Dbxref=NRL_3D
-  $self->source_name_pdb('PDB');        # For Dbxref=PDB
-  $self->source_name_rfam('RFAM');      # For Dbxref=Rfam
-  $self->source_name_tf('TransFac');    # For Dbxref=TF
-  $self->source_name_uniprotsp('Uniprot/SWISSPROT');    # For Dbxref=UniProt/Swiss-Prot
-  $self->source_name_uniprottr('Uniprot/SPTREMBL');     # For Dbxref=UniProt/TrEMBL
-  $self->source_name_bdgpinsituexpr('BDGP_insitu_expr');        # For Dbxref=bdgpinsituexpr
-  $self->source_name_dedb('DEDb');      # For Dbxref=dedb
-  $self->source_name_drosdel('DrosDel');        # For Dbxref=drosdel
-  $self->source_name_flygrid('FlyGrid');        # For Dbxref=flygrid
-  $self->source_name_hybrigenics('hybrigenics');        # For Dbxref=hybrigenics
-  $self->source_name_if('InteractiveFly');      # For Dbxref=if
-  $self->source_name_prefix_ensAGgene('Ens_Ag_gene');    # For Dbxref=ensAG
-  $self->source_name_prefix_ensAMgene('Ens_Am_gene');    # For Dbxref=ensAM
-  $self->source_name_prefix_ensCEgene('Ens_Ce_gene');    # For Dbxref=ensCE
-  $self->source_name_prefix_ensCFgene('Ens_Cf_gene');    # For Dbxref=ensCF
-  $self->source_name_prefix_ensDMgene('Ens_Dm_gene');    # For Dbxref=ensDM
-  $self->source_name_prefix_ensDRgene('Ens_Dr_gene');    # For Dbxref=ensDR
-  $self->source_name_prefix_ensFRgene('Ens_Fr_gene');    # For Dbxref=ensFR
-  $self->source_name_prefix_ensGGgene('Ens_Gg_gene');    # For Dbxref=ensGG
-  $self->source_name_prefix_ensHSgene('Ens_Hs_gene');    # For Dbxref=ensHS
-  $self->source_name_prefix_ensMMgene('Ens_Mm_gene');    # For Dbxref=ensMM
-  $self->source_name_prefix_ensPTgene('Ens_Pt_gene');    # For Dbxref=ensPT
-  $self->source_name_prefix_ensRNgene('Ens_Rn_gene');    # For Dbxref=ensRN
-  $self->source_name_prefix_ensTNgene('Ens_Tn_gene');    # For Dbxref=ensTN
-  $self->source_name_prefix_modCBgene('modCB_gene');     # For Dbxref=modCB
-  $self->source_name_prefix_modCEgene('modCE_gene');     # For Dbxref=modCE
-  $self->source_name_prefix_modDDgene('modDD_gene');     # For Dbxref=modDD
-
-  my @gene_types = qw (gene) ;
-  my @translation_types = qw (protein);
-  # The transcript_types may change from release to release so check that this list is up-to-date
-  my @transcript_types = qw (mRNA ncRNA snRNA tRNA rRNA pseudogene snoRNA miRNA);
-
-  $self->gene_types(\@gene_types) ;
-  $self->translation_types(\@translation_types) ;
-  $self->transcript_types(\@transcript_types) ;
-
-  $self->{'_xrefs'}=[];
-  $self->{'_direct_xrefs'}=[];
-  $self->{'_synonyms'}={};
-  
-  return $self;
-}
-
-
-# --------------------------------------------------------------------------------
-
-
-
-# large number of calls to SQL should now be speeded up as cached.
-sub get_source{
-  my ($self, $name) =@_;
-
-  if(!defined($cache_source{$name})){
-    $cache_source{$name} = $self->get_source_id_for_source_name($name)
-  }
-
-  return $cache_source{$name};
-}
-
-sub run {
-
-
-  my ($self, $ref_arg) = @_;
-  my $source_id    = $ref_arg->{source_id};
-  my $species_id   = $ref_arg->{species_id};
-  my $files        = $ref_arg->{files};
-  my $verbose      = $ref_arg->{verbose};
-
-  if((!defined $source_id) or (!defined $species_id) or (!defined $files) ){
-    croak "Need to pass source_id, species_id, files and rel_file as pairs";
-  }
-  $verbose |=0;
-
-  my $file = @{$files}[0];
-
-  my $species_name;
-
-  if(!defined($species_id)){
-    ($species_id, $species_name) = $self->get_species($file);
-  }
-  $self->species_id($species_id) ;
-
-  my $external_source_db_name = $self->external_source_db_name() ;
-  my $flybase_source_id = $self->get_source($external_source_db_name);
-
-  if(!$self->create_xrefs($flybase_source_id, $file)){
-    return 1;
-  }
-
-  my @xrefs = @{$self->xrefs};
-
-  $self->relink_synonyms_to_xrefs();
-
-  my @direct_xrefs = @{ $self->direct_xrefs } ;
-
-# # delete previous if running directly rather than via BaseParser
-#  if (!defined(caller(1))) {
-#    print "Deleting previous xrefs for these sources\n" if($verbose);
-#    $self->delete_by_source(\@xrefs);
-#  }
-  print "... parsed.\n" if($verbose);
-  print STDERR "uploading ".scalar(@xrefs)." xrefs's\n" if($verbose);
-  $self->upload_xref_object_graphs(\@xrefs);
-
-  print STDERR "uploading ".scalar(@direct_xrefs)." direct-xrefs's\n" if($verbose);
-  $self->upload_direct_xrefs(\@direct_xrefs);
-
-  return 0;
-}
-
-sub relink_synonyms_to_xrefs{
-  my $self = shift;
-  foreach my $x (@{$self->xrefs} ){
-    my $src_name = $self->get_source_name_for_source_id($x->{SOURCE_ID});
-    if ($src_name =~ m/^FlyBaseName_/ || $src_name =~ m/^flybase_.*_id$/) {
-      $x->{SYNONYMS} = $self->get_synonyms($x->{ENSEMBL_STABLE_ID});
-    }
-  }
-}
-
-# --------------------------------------------------------------------------------
-# Parse file into array of xref objects
-# parse
-
-sub create_xrefs {
-  my ($self, $flybase_source_id, $file) = @_;
-
-  print STDERR "starting to parse $file...." if($verbose);
-
-  my $gff_io = $self->get_filehandle($file);
-
-  if ( !defined $gff_io ) {
-    print STDERR "ERROR: Can't open the GFF file $file\n";
-    return 0;
-  }
-
-  while ( $_ = $gff_io->getline() ) {
-    chomp;
-    my @col = split /\t/;
-    if($col[3]){
-
-      # test if line contains information for object wanted (CDS,mRNA,gene,..)
-      if ( $self->line_contains_object_to_process( $col[2] ) ){
-        # work out if we have a gene, transcript or translation
- 	my $type = $self->set_ensembl_object_type($col[2]);
-        # the 9th column contains all the attributes
-	my @desc = split /\;/,$col[8];
-        # the ID= is always the first element of this array
-	my $unique_id = shift @desc;
-	if(!$unique_id=~m/ID=/){
-	  throw("parse-error: There seems to be no Identifier: $unique_id. Suspicous!");
-	  # print "parse-error: There seems to be no Identifier: $unique_id. Suspicous!";
-	  # return 0;
-	}
-        # for a gene, this will be FBgn, for a transcript this will be FBtr, etc
-	$unique_id =~s/ID=//g;
-        $self->make_id_xref($unique_id,$type);
-	# set up xref-entry for EVERY single item
-	foreach my $item (@desc) {
-          $self->set_flybase_synonyms($item,$unique_id);
-	  # make all xrefs for type "Name=" in desc-field
-          # these are FlyBaseName_gene for genes, FlyBaseName_transcript for transcripts, etc
-	  $self->make_name_xref($item,$unique_id,$type);
-
-	  # make all xrefs for type "Name=" in desc-field
-	  $self->make_dbxref_xref($item,$unique_id,$type);
-	}
-      }
-	  
-    } # we don't want to read the line otherwise
-	
-  } # while ( $_ = $gff_io->getline() ) {
-
-  $gff_io->close();
-
-  return 1;
-}
-
-sub set_ensembl_object_type{
-  my ($self,$t) = @_ ; # $t is identifier in gff for object : CDS,mRNA,gene,pseudogene,snRNA,....
-
-  for my $hc (@{ $self->gene_types } ){
-    if ($t=~m/^$hc$/){
-      return 'gene';
-    }
-  }
-  for my $hc (@{ $self->translation_types } ){
-    if ($t=~m/^$hc$/){
-      return 'translation';
-    }
-  }
-  for my $hc (@{ $self->transcript_types} ){
-    if ($t=~m/^$hc$/){
-      return 'transcript';
-    }
-  }
-}
-
-
-sub make_dbxref_xref{
-  my ($self,$item,$unique_id,$type) = @_;
-  # item = attribute 
-  # unique_id = ID
-  # type = gene, transcript, translation
-  my ($xref);
-  my $tg1 = $self->gff_dbxref ;
-  my $tg2 = $self->gff_ontology;
-
-  if ($item=~/$tg1/ || $item=~/$tg2/){ # Dbxref=
-    # split the xrefs up into a list
-    my $dbx1 = get_fields($item,$tg1);
-    my @dbx;
-    push @dbx, @{$dbx1} if $dbx1;
-
-    foreach my $dbx (@dbx) {
-      my $src_id = undef;
-      my $source_type = undef;
-
-      if ($dbx =~m/FlyBase:/){
-	$dbx =~s/FlyBase://g;
-
-	if($dbx=~m/FBgn/ and $type eq "gene"){
-	  $src_id = $self->get_source($self->source_name_fbgn);
-	}elsif ($dbx =~m/FBtr/ and $type eq "transcript"){
-	  $src_id = $self->get_source($self->source_name_fbtr);
-	}elsif ($dbx =~m/FBpp/ and $type eq "translation"){
-	  $src_id = $self->get_source($self->source_name_fbpp);
-	}elsif ($dbx =~m/FBan/){
-	  $src_id = $self->get_source($self->source_name_fban);
-	}
-      }elsif($dbx =~m/FlyBase_Annotation_IDs:/){
-	$dbx =~s/FlyBase_Annotation_IDs://g;
-	if($type eq "gene"){
-	  $src_id = $self->get_source($self->source_name_gadfly_gene) ;
-	}
-	elsif($type eq "translation"){
-	  $src_id = $self->get_source($self->source_name_gadfly_translation);
-	}
-	elsif($type eq "transcript"){
-	  $src_id = $self->get_source($self->source_name_gadfly_transcript); 
-	}
-      } elsif ($dbx =~m/Affymetrix:/) {
-        $dbx =~s/Affymetrix://g;
-        $src_id = $self->get_source($self->source_name_affymetrix) ;
-      } elsif ($dbx =~m/DGRC-1:/) {
-        $dbx =~s/DGRC-1://g;
-        $src_id = $self->get_source($self->source_name_dgrc1) ;
-      } elsif ($dbx =~m/DGRC-2:/) {
-        $dbx =~s/DGRC-2://g;
-        $src_id = $self->get_source($self->source_name_dgrc2);
-      } elsif ($dbx =~m/DRSC:/) {
-        $dbx =~s/DRSC://g;
-        $src_id = $self->get_source($self->source_name_drsc);
-      } elsif ($dbx =~m/EPD:/) {
-        $dbx =~s/EPD://g;
-        $src_id = $self->get_source($self->source_name_epd);
-      } elsif ($dbx =~m/FlyReg:/) {
-        $dbx =~s/FlyReg://g;
-        $src_id = $self->get_source($self->source_name_flyreg);
-      } elsif ($dbx =~m/GB:/) {
-        $dbx =~s/GB://g;
-        $src_id = $self->get_source($self->source_name_gb);
-      } elsif ($dbx =~m/GB_protein:/) {
-        $dbx =~s/GB_protein://g;
-        $src_id = $self->get_source($self->source_name_gbprotein);
-      } elsif ($dbx =~m/GCR:/) {
-        $dbx =~s/GCR://g;
-        $src_id = $self->get_source($self->source_name_gcr);
-      } elsif ($dbx =~m/GI:/) {
-        $dbx =~s/GI://g;
-        $src_id = $self->get_source($self->source_name_gi);
-      } elsif ($dbx =~m/GO:/) {
-        # this is an ontology_term
-        $dbx =~s/GO://g;
-        $src_id = $self->get_source($self->source_name_go);
-      } elsif ($dbx =~m/GenomeRNAi:/) {
-        $dbx =~s/GenomeRNAi://g;
-        $src_id = $self->get_source($self->source_name_genomeRNAi);
-      } elsif ($dbx =~m/INTERPRO:/) {
-        $dbx =~s/INTERPRO://g;
-        $src_id = $self->get_source($self->source_name_interpro);
-      } elsif ($dbx =~m/MEROPS:/) {
-        $dbx =~s/MEROPS://g;
-        $src_id = $self->get_source($self->source_name_merops);
-      } elsif ($dbx =~m/MIR:/) {
-        $dbx =~s/MIR://g;
-        $src_id = $self->get_source($self->source_name_miRBase);
-      } elsif ($dbx =~m/MITODROME:/) {
-        $dbx =~s/MITODROME://g;
-        $src_id = $self->get_source($self->source_name_mitodrome);
-      } elsif ($dbx =~m/NRL_3D:/) {
-        $dbx =~s/NRL_3D://g;
-        $src_id = $self->get_source($self->source_name_nrl3d);
-      } elsif ($dbx =~m/PDB:/) {
-        $dbx =~s/PDB://g;
-        $src_id = $self->get_source($self->source_name_pdb);
-      } elsif ($dbx =~m/Rfam:/) {
-        $dbx =~s/Rfam://g;
-        $src_id = $self->get_source($self->source_name_rfam);
-      } elsif ($dbx =~m/SO:/) {
-        # do nothing, we don't collect these
-      } elsif ($dbx =~m/TF:/) {
-        $dbx =~s/TF://g;
-        $src_id = $self->get_source($self->source_name_tf);
-      } elsif ($dbx =~m/UniProt\/Swiss-Prot:/) {
-        $dbx =~s/UniProt\/Swiss-Prot://g;
-        $src_id = $self->get_source($self->source_name_uniprotsp);
-      } elsif ($dbx =~m/UniProt\/TrEMBL:/) {
-        $dbx =~s/UniProt\/TrEMBL://g;
-        $src_id = $self->get_source($self->source_name_uniprottr);
-      } elsif ($dbx =~m/bdgpinsituexpr:/) {
-        $dbx =~s/bdgpinsituexpr://g;
-        $src_id = $self->get_source($self->source_name_bdgpinsituexpr);
-      } elsif ($dbx =~m/dedb:/) {
-        $dbx =~s/dedb://g;
-        $src_id = $self->get_source($self->source_name_dedb);
-      } elsif ($dbx =~m/drosdel:/) {
-        $dbx =~s/drosdel://g;
-        $src_id = $self->get_source($self->source_name_drosdel);
-      } elsif ($dbx =~m/flygrid:/) {
-        $dbx =~s/flygrid://g;
-        $src_id = $self->get_source($self->source_name_flygrid);
-      } elsif ($dbx =~m/hybrigenics:/) {
-        $dbx =~s/hybrigenics://g;
-        $src_id = $self->get_source($self->source_name_hybrigenics);
-      } elsif ($dbx =~m/if:/) {
-        $dbx =~s/if://g;
-        $src_id = $self->get_source($self->source_name_if);
-      } elsif ($dbx =~m/orthologs:ensAG:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensAGgene);
-      } elsif ($dbx =~m/orthologs:ensAM:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensAMgene);
-      } elsif ($dbx =~m/orthologs:ensCE:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensCEgene);
-      } elsif ($dbx =~m/orthologs:ensCF:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensCFgene);
-      } elsif ($dbx =~m/orthologs:ensDM:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensDMgene);
-      } elsif ($dbx =~m/orthologs:ensDR:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensDRgene);
-      } elsif ($dbx =~m/orthologs:ensFR:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensFRgene);
-      } elsif ($dbx =~m/orthologs:ensGG:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensGGgene);
-      } elsif ($dbx =~m/orthologs:ensHS:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensHSgene);
-      } elsif ($dbx =~m/orthologs:ensMM:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensMMgene);
-      } elsif ($dbx =~m/orthologs:ensPT:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensPTgene);
-      } elsif ($dbx =~m/orthologs:ensRN:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensRNgene);
-      } elsif ($dbx =~m/orthologs:ensTN:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_ensTNgene);
-      } elsif ($dbx =~m/orthologs:modCB:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_modCBgene);
-      } elsif ($dbx =~m/orthologs:modCE:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_modCEgene);
-      } elsif ($dbx =~m/orthologs:modDD:/) {
-        $dbx =~s/orthologs://g;
-        $src_id = $self->get_source($self->source_name_prefix_modDDgene);
-      } else {
-        warning("Dbxref type not recognised : $dbx");
-      }
-
-      if ($src_id){ # only add xref entry for FBgn FBtr...
-	my $xref ;
-	$xref->{ACCESSION} = $dbx ;
-	$xref->{LABEL} = $dbx;
-	$xref->{SOURCE_ID} = $src_id;
-	$xref->{SPECIES_ID} = $self->species_id();
-	#$xref->{SYNONYMS} = $self->get_synonyms($unique_id);
-	$self->add_xref($xref);
-
-	if ($type){
-	  my $direct_xref;
-	  $direct_xref = $xref ;
-	  $direct_xref->{ENSEMBL_STABLE_ID} = $unique_id;
-	  $direct_xref->{ENSEMBL_TYPE} = $type;
-	  #$direct_xref->{LINKAGE_XREF}=undef;
-	  $self->add_direct_xref($direct_xref) if $type ;
-	}
-      }
-    }
-    return;
-  }
-}
-
-sub set_flybase_synonyms {
-  my ($self,$item,$unique_id) = @_; 
-  my $syn1 = $self->gff_synonym;
-
-  if ($item=~/$syn1/){
-    my $s1 = get_fields($item,$syn1);
-    my @syns;
-    push @syns, @{$s1} if $s1;
-    $self->add_synonym($unique_id,\@syns);
-    return \@syns;
-  }
-  return;
-}
-
-sub make_id_xref{
-  my ($self,$unique_id,$type) = @_;
-  my $xref=undef;
-
-  # make an xref
-  $xref->{ACCESSION} = $unique_id;
-  $xref->{LABEL} = $unique_id;
-  $xref->{SPECIES_ID} = $self->species_id();
-  $xref->{SYNONYMS} = $self->get_synonyms($unique_id);
-  my $type_s = $type;
-  if ($type eq "gene") {
-    $type_s = $self->source_name_fbgn();
-  } elsif ($type eq "transcript") {
-    $type_s = $self->source_name_fbtr();
-  } elsif ($type eq "translation") {
-    $type_s =  $self->source_name_fbpp();
-  } else {
-    throw ("Type $type not recognised");
-  }
-
-  $xref->{SOURCE_ID} =  $self->get_source($type_s);
-	$self->add_xref($xref);
-
-  # only allow Name on genes. This is a fix for Biomart really.
-  if (defined($xref) and $type){
-    my $direct_xref;
-    $direct_xref = $xref ;
-    $direct_xref->{ENSEMBL_STABLE_ID} = $unique_id;
-    $direct_xref->{ENSEMBL_TYPE} = $type;
-    $direct_xref->{LINKAGE_TYPE}='bla';
-    $direct_xref->{SYNONYMS} = $self->get_synonyms($unique_id);
-    $self->add_direct_xref($direct_xref);
-  }
-  return;
-}
-
-sub make_name_xref{
-  my ($self,$item,$unique_id,$type) = @_;
-  my $xref=undef;
-  my $target = $self->gff_name ;
-  if($item=~m/$target/){  ##Name=
-    #print "having $$gff_gene_name[0]\n" ;
-    # remove the Name= bit and split the names on a ','
-    my $gff_gene_name = get_fields ( $item, $target ) ;
-    throw("there is more than one id for item $item\n") if $$gff_gene_name[1];
-    $xref->{ACCESSION} = $$gff_gene_name[0];
-    $xref->{LABEL} = $$gff_gene_name[0];
-    $xref->{SPECIES_ID} = $self->species_id();
-    $xref->{SYNONYMS} = $self->get_synonyms($unique_id);
-    my $type_s = $type;
-    if($type eq "translation"){
-      $type_s = $type."s";
-    }
-    $xref->{SOURCE_ID} =  $self->get_source($self->source_name_name_prefix().$type_s);
-	$self->add_xref($xref);
-  }
-  # only allow Name on genes. This is a fix for Biomart really.
-  if (defined($xref) and $type){
-    my $direct_xref;
-    $direct_xref = $xref ;
-    $direct_xref->{ENSEMBL_STABLE_ID} = $unique_id;
-    $direct_xref->{ENSEMBL_TYPE} = $type;
-    $direct_xref->{LINKAGE_TYPE}='bla';
-    $direct_xref->{SYNONYMS} = $self->get_synonyms($unique_id);
-    $self->add_direct_xref($direct_xref);
-  }
-  return;
-}
-
-sub get_fields {
-  my ($item, $target) =@_;
-  if ($item =~ m/$target/){
-    $item =~ s/$target//g;
-    my @entries = map { uri_escape($_) } split(/,/, $item);
-    return \@entries;
-  }
-  return;
-}
-
-sub source_name_name{
-  my $self = shift;
-
-  $self->{_source_name_name} = shift if @_ ;
-  return $self->{_source_name_name};
-}
-
-sub source_name_name_prefix{
-  my $self = shift;
-
-  $self->{_source_name_name_prefix} = shift if @_ ;
-  return $self->{_source_name_name_prefix};
-}
-
-
-sub source_name_synonym{
-  my $self = shift;
-
-  $self->{_source_name_synonym} = shift if @_ ;
-  return $self->{_source_name_synonym};
-}
-
-
-sub source_name_fbgn{
-  my $self = shift;
-
-  $self->{_source_name_gene} = shift if @_ ;
-  return $self->{_source_name_gene};
-}
-
-
-sub source_name_gadfly_gene{
-  my $self = shift;
-
-  $self->{_source_name_gadfly_gene} = shift if @_ ;
-  return $self->{_source_name_gadfly_gene};
-}
-
-sub source_name_gadfly_transcript{
-  my $self = shift;
-
-  $self->{_source_name_gadfly_transcript} = shift if @_ ;
-  return $self->{_source_name_gadfly_transcript};
-}
-sub source_name_gadfly_translation{
-  my $self = shift;
-
-  $self->{_source_name_gadfly_translation} = shift if @_ ;
-  return $self->{_source_name_gadfly_translation};
-}
-
-
-sub source_name_fbtr{
-  my $self = shift;
-
-  $self->{_source_name_transcript} = shift if @_ ;
-  return   $self->{_source_name_transcript}  ;
-}
-
-sub source_name_fbpp{
-  my $self = shift;
-
-  $self->{_source_name_fbpp} = shift if @_ ;
-  return $self->{_source_name_fbpp};
-}
-
-sub source_name_fban{
-  my $self = shift;
-
-  $self->{_sn_fban} = shift if @_ ;
-  return $self->{_sn_fban};
-}
-
-sub source_name_affymetrix {
-  my $self = shift;
-  $self->{_sn_affymetrix} = shift if @_ ;
-  return $self->{_sn_affymetrix};
-}
-
-sub source_name_dgrc1 {
-  my $self = shift;
-  $self->{_sn_dgrc1} = shift if @_ ;
-  return $self->{_sn_dgrc1};
-}
-
-sub source_name_dgrc2 {
-  my $self = shift;
-  $self->{_sn_dgrc2} = shift if @_ ;
-  return $self->{_sn_dgrc2};
-}
-
-sub source_name_drsc {
-  my $self = shift;
-  $self->{_sn_drsc} = shift if @_ ;
-  return $self->{_sn_drsc};
-}
-
-sub source_name_epd {
-  my $self = shift;
-  $self->{_sn_epd} = shift if @_ ;
-  return $self->{_sn_epd};
-}
-
-sub source_name_flyreg {
-  my $self = shift;
-  $self->{_sn_flyreg} = shift if @_ ;
-  return $self->{_sn_flyreg};
-}
-
-sub source_name_gb {
-  my $self = shift;
-  $self->{_sn_gb} = shift if @_ ;
-  return $self->{_sn_gb};
-}
-
-sub source_name_gbprotein {
-  my $self = shift;
-  $self->{_sn_gbprotein} = shift if @_ ;
-  return $self->{_sn_gbprotein};
-}
-
-sub source_name_gcr {
-  my $self = shift;
-  $self->{_sn_gcr} = shift if @_ ;
-  return $self->{_sn_gcr};
-}
-sub source_name_gi {
-  my $self = shift;
-  $self->{_sn_gi} = shift if @_ ;
-  return $self->{_sn_gi};
-}
-sub source_name_go {
-  my $self = shift;
-  $self->{_sn_go} = shift if @_ ;
-  return $self->{_sn_go};
-}
-
-sub source_name_genomeRNAi {
-  my $self = shift;
-  $self->{_sn_genomeRNAi} = shift if @_ ;
-  return $self->{_sn_genomeRNAi};
-}
-
-sub source_name_interpro {
-  my $self = shift;
-  $self->{_sn_interpro} = shift if @_ ;
-  return $self->{_sn_interpro};
-}
-
-sub source_name_merops {
-  my $self = shift;
-  $self->{_sn_merops} = shift if @_ ;
-  return $self->{_sn_merops};
-}
-
-sub source_name_miRBase {
-  my $self = shift;
-  $self->{_sn_miRBase} = shift if @_ ;
-  return $self->{_sn_miRBase};
-}
-
-sub source_name_mitodrome {
-  my $self = shift;
-  $self->{_sn_mitodrome} = shift if @_ ;
-  return $self->{_sn_mitodrome};
-}
-
-sub source_name_nrl3d {
-  my $self = shift;
-  $self->{_sn_nrl3d} = shift if @_ ;
-  return $self->{_sn_nrl3d};
-}
-
-sub source_name_pdb {
-  my $self = shift;
-  $self->{_sn_pdb} = shift if @_ ;
-  return $self->{_sn_pdb};
-}
-
-sub source_name_rfam {
-  my $self = shift;
-  $self->{_sn_rfam} = shift if @_ ;
-  return $self->{_sn_rfam};
-}
-
-sub source_name_tf {
-  my $self = shift;
-  $self->{_sn_tf} = shift if @_ ;
-  return $self->{_sn_tf};
-}
-
-sub source_name_uniprotsp {
-  my $self = shift;
-  $self->{_sn_uniprotsp} = shift if @_ ;
-  return $self->{_sn_uniprotsp};
-}
-
-sub source_name_uniprottr {
-  my $self = shift;
-  $self->{_sn_uniprottr} = shift if @_ ;
-  return $self->{_sn_uniprottr};
-}
-
-sub source_name_bdgpinsituexpr {
-  my $self = shift;
-  $self->{_sn_bdgpinsituexpr} = shift if @_ ;
-  return $self->{_sn_bdgpinsituexpr};
-}
-
-sub source_name_dedb {
-  my $self = shift;
-  $self->{_sn_dedb} = shift if @_ ;
-  return $self->{_sn_dedb};
-}
-
-sub source_name_drosdel {
-  my $self = shift;
-  $self->{_sn_drosdel} = shift if @_ ;
-  return $self->{_sn_drosdel};
-}
-
-sub source_name_flygrid {
-  my $self = shift;
-  $self->{_sn_flygrid} = shift if @_ ;
-  return $self->{_sn_flygrid};
-}
-
-sub source_name_hybrigenics {
-  my $self = shift;
-  $self->{_sn_hybrigenics} = shift if @_ ;
-  return $self->{_sn_hybrigenics};
-}
-
-sub source_name_if {
-  my $self = shift;
-  $self->{_sn_if} = shift if @_ ;
-  return $self->{_sn_if};
-}
-
-sub source_name_prefix_ensAGgene {
-  my $self = shift;
-  $self->{_sn_prefix_ensAG} = shift if @_ ;
-  return $self->{_sn_prefix_ensAG};
-}
-
-sub source_name_prefix_ensAMgene {
-  my $self = shift;
-  $self->{_sn_prefix_ensAM} = shift if @_ ;
-  return $self->{_sn_prefix_ensAM};
-}
-
-sub source_name_prefix_ensCEgene {
-  my $self = shift;
-  $self->{_sn_prefix_ensCE} = shift if @_ ;
-  return $self->{_sn_prefix_ensCE};
-}
-
-sub source_name_prefix_ensCFgene {
-  my $self = shift;
-  $self->{_sn_prefix_ensCF} = shift if @_ ;
-  return $self->{_sn_prefix_ensCF};
-}
-
-sub source_name_prefix_ensDMgene {
-  my $self = shift;
-  $self->{_sn_prefix_ensDM} = shift if @_ ;
-  return $self->{_sn_prefix_ensDM};
-}
-
-sub source_name_prefix_ensDRgene {
-  my $self = shift;
-  $self->{_sn_prefix_ensDR} = shift if @_ ;
-  return $self->{_sn_prefix_ensDR};
-}
-
-sub source_name_prefix_ensFRgene {
-  my $self = shift;
-  $self->{_sn_prefix_ensFR} = shift if @_ ;
-  return $self->{_sn_prefix_ensFR};
-}
-
-sub source_name_prefix_ensGGgene {
-  my $self = shift;
-  $self->{_sn_prefix_ensGG} = shift if @_ ;
-  return $self->{_sn_prefix_ensGG};
-}
-
-sub source_name_prefix_ensHSgene {
-  my $self = shift;
-  $self->{_sn_prefix_ensHS} = shift if @_ ;
-  return $self->{_sn_prefix_ensHS};
-}
-
-sub source_name_prefix_ensMMgene {
-  my $self = shift;
-  $self->{_sn_prefix_ensMM} = shift if @_ ;
-  return $self->{_sn_prefix_ensMM};
-}
-
-sub source_name_prefix_ensPTgene {
-  my $self = shift;
-  $self->{_sn_prefix_ensPT} = shift if @_ ;
-  return $self->{_sn_prefix_ensPT};
-}
-
-sub source_name_prefix_ensRNgene {
-  my $self = shift;
-  $self->{_sn_ensRN} = shift if @_ ;
-  return $self->{_sn_ensRN};
-}
-
-sub source_name_prefix_ensTNgene {
-  my $self = shift;
-  $self->{_sn_ensTN} = shift if @_ ;
-  return $self->{_sn_ensTN};
-}
-
-sub source_name_prefix_modCBgene {
-  my $self = shift;
-  $self->{_sn_modCB} = shift if @_ ;
-  return $self->{_sn_modCB};
-}
-
-sub source_name_prefix_modCEgene {
-  my $self = shift;
-  $self->{_sn_modCE} = shift if @_ ;
-  return $self->{_sn_modCE};
-}
-
-sub source_name_prefix_modDDgene {
-  my $self = shift;
-  $self->{_sn_modDD} = shift if @_ ;
-  return $self->{_sn_modDD};
-}
-
-sub gff_name{
-  my $self = shift;
-  $self->{_gff_name} = shift if @_ ;
-  return $self->{_gff_name};
-}
-
-sub gff_dbxref{
-  my $self = shift;
-  $self->{_gff_dbxref} = shift if @_ ;
-  return $self->{_gff_dbxref};
-}
-
-sub gff_synonym{
-  my $self = shift;
-  $self->{_gff_synonym} = shift if @_ ;
-  return $self->{_gff_synonym};
-}
-
-sub gff_ontology{
-  my $self = shift;
-  $self->{_gff_ontology} = shift if @_ ;
-  return $self->{_gff_ontology};
-}
-
-sub species_id {
-  my $self = shift;
-  $self->{_species_id} = shift if @_ ;
-  return $self->{_species_id};
-}
-
-sub xrefs{
-  my $self = shift;
-
-  $self->{_xrefs} = shift if @_ ;
-  return $self->{_xrefs};
-}
-
-sub add_xref{
-    my ($self,$add_xref) = @_;
-    push @{$self->xrefs() }, $add_xref;
-    return;
-}
-
-
-sub direct_xrefs{
-  my $self = shift;
-
-  $self->{_direct_xrefs} = shift if @_ ;
-  return $self->{_direct_xrefs};
-}
-
-sub add_direct_xref{
-    my ($self,$dr) = @_;
-
-    push @{$self->direct_xrefs() }, $dr;
-    return;
-}
-
-
-
-
-
-sub line_contains_object_to_process{
-  my ($self,$type_of_line) = @_;  # shoud be mRNA, gene, pseudogene, CDS,...
-
-  for my $check_types ( @{$self->gff_object_types}) {
-    if ($check_types =~/^$type_of_line$/){
-      return 1;
-    }
-  }
-  return 0;
-}
-
-
-=pod
-
-=head2 gff_object_types
-
-  Title       : gff_object_types
-  Usage       : $obj->gff_object_types(array-ref)
-  Function    : contains gff-type-identifiers of gff-objects which have to be processed
-  Arguments   : array-ref
-  Return-Val  : array-ref
-
-=cut
-
-sub gff_object_types{
-  my $self = shift;
-
-  $self->{_gff_object_types} = shift if @_ ;
-  return $self->{_gff_object_types};
-}
-
-
-=pod
-
-=head2 external_source_db_name
-
-  Title       : external_source_db_name
-  Usage       : $obj->external_source_db_name(external db name)
-  Function    : returns name of hardcoded external source db name 
-  Arguments   : external db name
-  Return-Val  : string
-
-=cut
-
-sub external_source_db_name{
-  my $self = shift;
-
-  $self->{_external_source_db_name} = shift if @_ ;
-  return $self->{_external_source_db_name};
-}
-
-
-
-# --------------------------------------------------------------------------------
-# Get species (id and name) from file
-# For UniProt files the filename is the taxonomy ID
-
-sub get_species {
-  my ($self, $file) = @_;
-  my ($taxonomy_id, $extension) = split(/\./, basename($file));
-  my $sth = $self->dbi()->prepare("SELECT species_id,name FROM species WHERE taxonomy_id=?");
-  $sth->execute($taxonomy_id);
-  my ($species_id, $species_name);
-  while(my @row = $sth->fetchrow_array()) {
-    $species_id = $row[0];
-    $species_name = $row[1];
-  }
-  $sth->finish;
-
-  if (defined $species_name) {
-    print "Taxonomy ID " . $taxonomy_id . " corresponds to species ID " . $species_id . " name " . $species_name . "\n" if($verbose);
-  } else {
-    throw("Cannot find species corresponding to taxonomy ID " . $species_id . " - check species table\n");
-  }
-
-  return ($species_id, $species_name);
-
-}
-
-sub add_synonym{
-  my ($self,$unique_id,$synref) = @_;
-  #print "adding synonym for -$unique_id-:".join(" " , @$synref)."\n" ; ;
-  ${$self->synonyms}{$unique_id}=$synref   if($synref);
-  return;
-}
-
-
-sub get_synonyms{
-  my ($self,$unique_id) = @_;
-
-  return ${$self->synonyms}{$unique_id};
-}
-
-
-sub synonyms{
-  my $self = shift;
-  $self->{_synonyms} = shift if @_ ;
-  return $self->{_synonyms};
-}
-
-
-
-
-sub gene_types{
-  my $self = shift;
-
-  $self->{_gene_types} = shift if @_ ;
-  return $self->{_gene_types};
-}
-
-sub transcript_types{
-  my $self = shift;
-
-  $self->{_trans_types} = shift if @_ ;
-  return $self->{_trans_types};
-}
-
-sub translation_types{
-  my $self = shift;
-
-  $self->{_tl_types} = shift if @_ ;
-  return $self->{_tl_types};
-}
-
- 1;
-
- #  Drosophila v5.3 : xrefs 
- #  Gff_file        external_db_id  db_name
- #  ==
- #  Affymetrix      3120    AFFY_DrosGenome1
- #  DGRC-1  830     DGRC-1 
- #  DGRC-2  831     DGRC-2 
- #  DRSC    840     DRSC
- #  EPD     10100   EPD 
- #  FlyBase 800     flybase_gene_id
- #  FlyBase_Annotation_IDs  804     flybase_annotation_id
- #  FlyReg  850     FlyReg
- #  GB      700     EMBL
- #  GB_protein      1700    protein_id
- #  GCR     10200   GPCR
- #  GI      10900   GI
- #  GO      1000    GO
- #  GenomeRNAi      860     GenomeRNAi
- #  INTERPRO        1200    Interpro
- #  MEROPS  10300   MEROPS 
- #  MIR     10400   miRBase 
- #  MITODROME       870     MitoDrome 
- #  NRL_3D  1600    PDB
- #  PDB     1600    PDB
- #  Rfam    4200    RFAM
- #  TF      10500   TransFac 
- #  UniProt/Swiss-Prot      2200    Uniprot/SWISSPROT
- #  UniProt/TrEMBL  2000    Uniprot/SPTREMBL
- #  bdgpinsituexpr  880     BDGP_insitu_expr 
- #  dedb    890     DEDb 
- #  drosdel 881     DrosDel 
- #  flygrid 882     FlyGrid 
- #  hybrigenics     883     hybrigenics
- #  if      884     InteractiveFly
- #  ensAG   6600    Ens_Ag_gene # Anopheles gambiae
- #  ensAM   6630    Ens_Am_gene # apis mellifera?
- #  ensCE   6660    Ens_Ce_gene # C Elegans
- #  ensCF   5700    Ens_Cf_gene # Canis familiaris
- #  ensDM   6690    Ens_Dm_gene #
- #  ensDR   5800    Ens_Dr_gene # Danio rerio
- #  ensFR   6720    Ens_Fr_gene # Takifugu rubripes
- #  ensGG   6400    Ens_Gg_gene # Gallus gallus
- #  ensHS   2700    Ens_Hs_gene # Homo sapiens
- #  ensMM   5000    Ens_Mm_gene # mus musculus
- #  ensPT   6750    Ens_Pt_gene # Pan troglodytes
- #  ensRN   6200    Ens_Rn_gene # Rattus norvegicus
- #  ensTN   6810    Ens_Tn_gene # Tetraodon nigroviridis
- #  modCB   10600   modCB # InParanoid Model organism database, Caenorhabditis briggsae
- #  modCE   10700   modCE # Caenorhabditis elegans
- #  modDD   10800   modDD # Dictyostelium discoideum
-- 
GitLab