# HGNCParser.pm
package XrefParser::HGNCParser;

use strict;
use File::Basename;

use base qw( XrefParser::BaseParser );

my $xref_sth ;
my $dep_sth;
my $syn_sth;

# --------------------------------------------------------------------------------
# Parse command line and run if being run directly

if ( !defined( caller() ) ) {

  # Only the file name is mandatory.  The two ids are optional because run()
  # looks up any id that is left undefined from the file name.
  if ( scalar(@ARGV) < 1 || scalar(@ARGV) > 3 ) {
    print STDERR "\nUsage: HGNCParser.pm file [source_id] [species_id]\n\n";
    exit(1);
  }

  my ( $file, $source_id, $species_id ) = @ARGV;

  # run() takes (source_id, species_id, files_ref, rel_file, verbose) and
  # expects the input files wrapped in an array reference, so @ARGV cannot be
  # handed over unchanged: the file name would land in the source_id slot.
  run( $source_id, $species_id, [$file] );
}

# Parse an HGNC download file and store its entries as xrefs.
#
# Arguments (after the invocant, when called as a method):
#   source_id  - source id; looked up from the file name when undefined
#   species_id - species id; looked up from the file name when undefined
#   files_ref  - array reference; only its first element (the HGNC file) is read
#   rel_file   - accepted for interface compatibility, not used here
#   verbose    - when true, print summary counts at the end
#
# For every data line the entry is stored under the "desc_only" HGNC source,
# then linked as a direct xref to an Ensembl gene and as a dependent xref to
# each RefSeq / EntrezGene id that is already known.  Previous symbols and
# aliases are stored as synonyms for every HGNC source the entry was added to.
#
# Returns 0 on success and 1 when no input file was given or it cannot be
# opened.  Dies when one of the HGNC source ids cannot be found.
sub run {

  # The invocant is only present when we were called from inside another sub
  # (the normal method call made by the xref pipeline); when the file is run
  # directly there is none.  A ternary is used because "my ... if COND" has
  # undefined behaviour in Perl.  In the direct case the package name stands
  # in as invocant so the method calls below still resolve.
  # NOTE(review): assumes add_xref/get_filehandle work as class methods, like
  # the other BaseParser calls made in this sub - confirm.
  my $self = defined( caller(1) ) ? shift : __PACKAGE__;

  my ( $source_id, $species_id, $files_ref, $rel_file, $verbose ) = @_;

  my $file = ( ref($files_ref) eq 'ARRAY' ) ? $files_ref->[0] : undef;
  if ( !defined $file ) {
    print "ERROR: No HGNC file given\n";
    return 1;
  }

  if ( !defined($source_id) ) {
    $source_id = XrefParser::BaseParser->get_source_id_for_filename($file);
  }
  if ( !defined($species_id) ) {
    $species_id = XrefParser::BaseParser->get_species_id_for_filename($file);
  }

  # One HGNC source per priority description; each lookup dies on failure.
  my $hgnc_refseq_manual     = _hgnc_source_id_for("refseq_manual");
  my $hgnc_refseq_mapped     = _hgnc_source_id_for("refseq_mapped");
  my $hgnc_entrezgene_manual = _hgnc_source_id_for("entrezgene_manual");
  my $hgnc_entrezgene_mapped = _hgnc_source_id_for("entrezgene_mapped");
  my $hgnc_ensembl_manual    = _hgnc_source_id_for("ensembl_manual");
  my $hgnc_desc_only         = _hgnc_source_id_for("desc_only");

  # Lookup tables from an external accession to the value that is passed to
  # add_to_xrefs() as its first argument.
  my %refseq =
    %{ XrefParser::BaseParser->get_valid_codes( "refseq", $species_id ) };
  my %entrezgene =
    %{ XrefParser::BaseParser->get_valid_xrefs_for_dependencies(
      "EntrezGene", "refseq_peptide", "refseq_dna" ) };

  my $refseq_count     = 0;
  my $entrezgene_count = 0;
  my $ensembl_count    = 0;
  my $mismatch         = 0;

  my $hugo_io = $self->get_filehandle($file);

  if ( !defined $hugo_io ) {
    print "ERROR: Can't open HGNC file $file\n";
    return 1;
  }

  # The first line is the column header; discard it.
  $hugo_io->getline();

  # A lexical line variable keeps the caller's $_ intact, and the explicit
  # defined() stops only at end of file, not on a line that is merely false.
  while ( defined( my $line = $hugo_io->getline() ) ) {

    chomp $line;

    # 0 HGNC ID            # primary accession
    # 1 Approved Symbol    # label
    # 2 Approved Name      # description
    # 3 Previous Symbols   # synonyms
    # 4 Aliases            # aliases
    # 5 entrezgene ID   manually curated
    # 6 RefSeq ID       manually curated
    # 7 entrezgene ID   mapped
    # 8 RefSeq ID       mapped
    # 9 Ensembl ID     manual
    my (
      $acc,              $symbol,            $name,
      $previous_symbols, $aliases,           $entrez_manual_id,
      $refseq_manual_id, $entrez_mapped_id,  $refseq_mapped_id,
      $ensembl_id
    ) = split( /\t/, $line );

    # Blank lines carry no accession and must not produce an xref.
    next if ( !defined $acc || $acc eq '' );

    my @synonym_fields = ( $previous_symbols, $aliases );

    # Set as soon as the entry is linked via Ensembl, RefSeq or EntrezGene.
    my $seen = 0;

    # Always store the entry itself under the desc_only source first.
    $self->add_xref( $acc, "", $symbol, $name, $hgnc_desc_only, $species_id,
      "MISC" );
    _hgnc_add_synonyms( $acc, $hgnc_desc_only, $species_id, @synonym_fields );

    if ($ensembl_id) {    # Ensembl direct xref
      $seen = 1;
      $ensembl_count++;
      XrefParser::BaseParser->add_to_direct_xrefs( $ensembl_id, 'gene', $acc,
        '', $symbol, $name, "", $hgnc_ensembl_manual, $species_id );
      _hgnc_add_synonyms( $acc, $hgnc_ensembl_manual, $species_id,
        @synonym_fields );
    }

    # The four dependent-xref columns are handled identically, so they are
    # described as rows of: id from the file, lookup table, HGNC source id to
    # store under, counter to bump.  Order matches the original processing.
    my @dependent_links = (
      [ $refseq_manual_id, \%refseq, $hgnc_refseq_manual, \$refseq_count ],
      [ $refseq_mapped_id, \%refseq, $hgnc_refseq_mapped, \$refseq_count ],
      [
        $entrez_manual_id,       \%entrezgene,
        $hgnc_entrezgene_manual, \$entrezgene_count
      ],
      [
        $entrez_mapped_id,       \%entrezgene,
        $hgnc_entrezgene_mapped, \$entrezgene_count
      ],
    );

    foreach my $link (@dependent_links) {
      my ( $id, $master_for, $link_source_id, $counter_ref ) = @{$link};

      # Skip empty columns and ids that are not in the lookup table.
      next if ( !defined $id || $id eq '' );
      next if ( !defined $master_for->{$id} );

      $seen = 1;
      ${$counter_ref}++;
      XrefParser::BaseParser->add_to_xrefs( $master_for->{$id}, $acc, '',
        $symbol, $name, "", $link_source_id, $species_id );
      _hgnc_add_synonyms( $acc, $link_source_id, $species_id,
        @synonym_fields );
    }

    if ( !$seen ) {
      $mismatch++;
    }

  }    # while HGNC

  $hugo_io->close();

  if ($verbose) {

    # The total includes the Ensembl links, which are listed as one of its
    # components in the same message.
    print "Loaded a total of "
      . ( $refseq_count + $entrezgene_count + $ensembl_count )
      . " HGNC xrefs, $refseq_count from RefSeq curated mappings and $entrezgene_count from EntrezGene mappings and $ensembl_count from ensembl_mapping\n";

    print
"$mismatch xrefs could not be associated via RefSeq, EntrezGene or ensembl\n";
  }

  return 0;    # successful

}

# Return the source id of the HGNC source with the given priority
# description; die when there is none.
sub _hgnc_source_id_for {
  my ($priority_description) = @_;

  my $id = XrefParser::BaseParser->get_source_id_for_source_name( "HGNC",
    $priority_description );
  if ( !defined($id) ) {
    die
"Could not get source id for HGNC with priority description of $priority_description\n";
  }

  return $id;
}

# Store every name found in the given comma-separated fields as a synonym of
# $acc under $source_id.  Undefined fields (columns missing from the line) are
# skipped.
sub _hgnc_add_synonyms {
  my ( $acc, $source_id, $species_id, @fields ) = @_;

  foreach my $field (@fields) {
    next if ( !defined $field );
    foreach my $synonym ( split( /,\s*/, $field ) ) {
      XrefParser::BaseParser->add_to_syn( $acc, $source_id, $synonym,
        $species_id );
    }
  }

  return;
}

# Returns the fixed file name "hugo.txt".
# NOTE(review): presumably the local name the downloaded HGNC file is saved
# under - confirm against the caller.
sub rename_url_file {
  my $local_name = "hugo.txt";
  return $local_name;
}

1;