diff --git a/misc-scripts/xref_mapping/XrefParser/AedesGenBankParser.pm b/misc-scripts/xref_mapping/XrefParser/AedesGenBankParser.pm index 5769b8da38e99bea92ae9381664953152deb4a71..7fb3838057248bb7aa082c42b6793e36fff4020a 100644 --- a/misc-scripts/xref_mapping/XrefParser/AedesGenBankParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/AedesGenBankParser.pm @@ -3,10 +3,7 @@ package XrefParser::AedesGenBankParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); #Aedes GenBank protein - because not yet in UniProt #>EAT48991.1 @@ -25,11 +22,13 @@ sub run { local $/ = "\n>"; - if(!open(FILE,"<".$file)){ - print "Could not open $file\n"; - return 1; + my $file_io = $self->get_filehandle($file); + if ( !defined $file_io ) { + print "Could not open $file\n"; + return 1; } - while (<FILE>) { + + while ( $_ = $file_io->getline() ) { my $xref; @@ -73,7 +72,7 @@ sub run { } - close (FILE); + $file_io->close(); print scalar(@xrefs) . 
" AedesGenBank xrefs succesfully parsed\n"; @@ -83,13 +82,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::AedesGenBankParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/AgilentParser.pm b/misc-scripts/xref_mapping/XrefParser/AgilentParser.pm index 08173417265d2683008e51b616bec4be5d21cbca..a9a37f555cb1d5e41ad41a246d58de310e82f52c 100644 --- a/misc-scripts/xref_mapping/XrefParser/AgilentParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/AgilentParser.pm @@ -3,10 +3,7 @@ package XrefParser::AgilentParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # OParser for FASTA-format probe mappings from Agilent # >A_23_P253586 @@ -22,12 +19,15 @@ sub run { # local $/ = "\n>"; - if(!open(AG,"<".$file)){ - print "Could not open $file\n"; - return 1; + my $ag_io = $self->get_filehandle($file); + + if ( !defined $ag_io ) { + print "Could not open $file\n"; + return 1; } + my $probe; - while (<AG>) { + while ( $_ = $ag_io->getline() ) { chomp; @@ -57,7 +57,7 @@ sub run { } } - close(AG); + $ag_io->close(); print scalar(@xrefs) . 
" Agilent xrefs succesfully parsed\n"; @@ -67,13 +67,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::AgilentParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/AnophelesSymbolParser.pm b/misc-scripts/xref_mapping/XrefParser/AnophelesSymbolParser.pm index eaf5a838e70363cc259f65b7db79a6533c996433..641b724dfb620f7f435044cf89f56f8c939d24fa 100644 --- a/misc-scripts/xref_mapping/XrefParser/AnophelesSymbolParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/AnophelesSymbolParser.pm @@ -3,10 +3,7 @@ package XrefParser::AnophelesSymbolParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # AnophelesSymbol database dump for anopheles - FASTA format # @@ -25,12 +22,14 @@ sub run { local $/ = "\n>"; - if(!open(FILE,"<".$file)){ - print "Could not open $file\n"; - return 1; + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { + print "Could not open $file\n"; + return 1; } - while (<FILE>) { + while ( $_ = $file_io->getline() ) { my $xref; my ($header, $sequence) = $_ =~ /^>?(.+?)\n([^>]*)/s or warn("Can't parse FASTA entry: $_\n"); @@ -55,7 +54,7 @@ sub run { } - close (FILE); + $file_io->close(); print scalar(@xrefs) . 
" AnophelesSymbol xrefs succesfully parsed\n"; @@ -65,13 +64,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::AnophelesSymbolParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/BaseParser.pm b/misc-scripts/xref_mapping/XrefParser/BaseParser.pm index 9b368624afbabd535f4a0c0211e053980d4769a4..ebcaf7246b1ec8e5d782da4b1ca77913d90fbaa4 100644 --- a/misc-scripts/xref_mapping/XrefParser/BaseParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/BaseParser.pm @@ -27,16 +27,27 @@ my %dependent_sources; my %taxonomy2species_id; my %name2species_id; -my ($host, $port, $dbname, $user, $pass, $create, $release, $cleanup, $deletedownloaded); -my ($skipdownload,$drop_db,$checkdownload, $dl_path) ; +my ( + $host, $port, $dbname, + $user, $pass, $create, + $release, $cleanup, $deletedownloaded, + $skipdownload, $drop_db, $checkdownload, + $dl_path, $compressed +); # -------------------------------------------------------------------------------- # Get info about files to be parsed from the database -sub run { - - ($host, $port, $dbname, $user, $pass, my $speciesr, my $sourcesr, $skipdownload, $checkdownload, - $create, $release, $cleanup, $drop_db, $deletedownloaded, $dl_path, my $notsourcesr) = @_; +sub run +{ + ( + $host, $port, $dbname, + $user, $pass, my $speciesr, + my $sourcesr, $skipdownload, $checkdownload, + $create, $release, $cleanup, + $drop_db, $deletedownloaded, $dl_path, + my $notsourcesr, $compressed + ) = @_; $base_dir = $dl_path if $dl_path; @@ -195,8 +206,8 @@ sub run { if ($checkdownload) { my $check_file = $dir . '/' . $file; - $check_file =~ s/\.gz$//; - $check_file =~ s/\.Z$//; + + if ( !$compressed ) { $check_file =~ s/\.(gz|Z)$// } print "Checking for file '$check_file'\n"; @@ -206,8 +217,7 @@ sub run { $skipdownload = 1; - $file =~ s/\.gz$//; - $file =~ s/\.Z$//; + if ( !$compressed ) { $file =~ s/\.(gz|Z)$// } } else { print "File '$check_file' does not exist.\n" . 
"Scheduling '$dir/$file' for download...\n"; @@ -265,22 +275,27 @@ sub run { croak("Could not get $type file $file tried 5 times but failed"); } - # if the file is compressed, the FTP server may or may not have automatically uncompressed it - # TODO - read .gz file directly? open (FILE, "zcat $file|") or Compress::Zlib - if ($file =~ /(.*)\.gz$/ or $file =~ /(.*)\.Z$/) { - print "Uncompressing $dir/$file\n"; - system("gunzip -f $dir/$file"); - $file = $1; - } - if ($file =~ /(.*)\.zip$/) { - print "Unzipping $dir/$file\n"; - system("unzip -o -q -d $dir $dir/$file"); - } + # If the file is compressed, the FTP server may or may not have + # automatically uncompressed it (it shouldn't have, is this an + # historical artifact? (ak)). + + if ( !$compressed && ( $file =~ /\.(gz|Z)$/ ) ) { + print "Uncompressing '$dir/$file' using 'gunzip'\n"; + system( "gunzip", "-f", $dir . '/' . $file ); + } + if ( $file =~ /(.*)\.zip$/ ) { + print "Uncompressing '$dir/$file' using 'unzip'\n"; + system( "unzip", "-o", "-q", "-d", $dir, + $dir . '/' . $file ); + } } - $file =~s/\.gz$//; # if skipdownload set this will not have been done yet. - $file=~s/\.Z$//; # if it has no harm done + if ( !$compressed ) { + $file =~ s/\.(gz|Z)$//; # If skipdownload set this will + # not have been done yet. + # If it has, no harm done + } if ($file_from_archive) { push @new_file, $file_from_archive; @@ -361,13 +376,57 @@ sub run { # -------------------------------------------------------------------------------- -sub new { +# Given a file name, returns a IO::Handle object. If the file is +# gzipped, the handle will be to an unseekable stream coming out of a +# zcat pipe. If the given file name doesn't correspond to an existing +# file, the routine will try to add '.gz' to the file name or to remove +# any .'Z' or '.gz' and try again. Returns undef on failure and will +# write a warning to stderr. 
+ +sub get_filehandle +{ + my ($self, $file_name) = @_; + + my $io; + + my $alt_file_name = $file_name; + $alt_file_name =~ s/\.(gz|Z)$//; - my $self = {}; - bless $self, "BaseParser"; + if ( $alt_file_name eq $file_name ) { + $alt_file_name .= '.gz'; + } + + if ( !-f $file_name ) { + carp( "File '$file_name' does not exist, " + . "will try '$alt_file_name'" ); + $file_name = $alt_file_name; + } + if ( !-f $file_name ) { carp("File '$file_name' does not exist"); return undef } + if ( $file_name =~ /\.(gz|Z)$/ ) { + # Read from zcat pipe; the list form keeps the file name away from the shell + $io = IO::File->new(); + open( $io, '-|', 'zcat', $file_name ) or do { carp("Can not open file '$file_name' with 'zcat'"); undef $io }; + } else { + # Read file normally; the explicit mode stops the name being parsed as one + $io = IO::File->new( $file_name, 'r' ) + or carp("Can not open file '$file_name'"); + } + + if ( !defined $io ) { return undef } - return $self; + print "Reading from '$file_name'...\n"; + + return $io; +} + +# -------------------------------------------------------------------------------- + +sub new +{ + my ($proto) = @_; + my $class = ref $proto || $proto; + return bless {}, $class; } # -------------------------------------------------------------------------------- diff --git a/misc-scripts/xref_mapping/XrefParser/CCDSParser.pm b/misc-scripts/xref_mapping/XrefParser/CCDSParser.pm index 877a5995a30dd01ba6ab373fb14c430c40d38577..e5fdc637b6f9bc760da46eb5c83d139b9b8a329e 100644 --- a/misc-scripts/xref_mapping/XrefParser/CCDSParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/CCDSParser.pm @@ -4,10 +4,7 @@ use strict; use DBI; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parse file of CCDS records and assign direct xrefs # All assumed to be linked to transcripts @@ -18,17 +15,19 @@ sub run { my ($self, $file, $source_id, $species_id) = @_; - if(!open(CCDS,"<".$file)){ - print "Could not open $file\n"; - return 1; + my $ccds_io = $self->get_filehandle($file); + + if ( !defined $ccds_io ) { + print "Could not open $file\n"; + return 1; } + my $line_count = 0; my $xref_count = 0; my $xref_sth = 
$self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND version=? AND source_id=$source_id AND species_id=$species_id"); - while (<CCDS>) { - + while ( $_ = $ccds_io->getline() ) { my ($stable_id, $ccds) = split; my ($acc, $version) = split (/\./, $ccds); @@ -48,17 +47,8 @@ sub run { print "Parsed $line_count CCDS identifiers from $file, added $xref_count xrefs and $line_count direct_xrefs\n"; - close(CCDS); + $ccds_io->close(); return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::CCDSParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/CeleraParser.pm b/misc-scripts/xref_mapping/XrefParser/CeleraParser.pm index ab51ee815dc8fb9a446c713061b410a706b8064a..0d654293977f1263243e84416e5d79aa25d3f1b4 100644 --- a/misc-scripts/xref_mapping/XrefParser/CeleraParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/CeleraParser.pm @@ -3,10 +3,7 @@ package XrefParser::CeleraParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Celera database dump for anopheles - FASTA format # @@ -27,12 +24,14 @@ sub run { local $/ = "\n>"; - if(!open(FILE,"<".$file)){ + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { print "Could not open $file\n"; return 1; } - while (<FILE>) { + while ( $_ = $file_io->getline() ) { next if (/^File:/); # skip header my $xref; @@ -69,7 +68,7 @@ sub run { } - close (FILE); + $file_io->close(); print scalar(@xrefs) . 
" Celera xrefs succesfully parsed\n"; @@ -79,13 +78,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::CeleraParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/CeleraProteinParser.pm b/misc-scripts/xref_mapping/XrefParser/CeleraProteinParser.pm index 1325537fb71d7a0710bba4578deccc22a3eae471..a51cc21403bc9936c22b9026053c23501c817716 100644 --- a/misc-scripts/xref_mapping/XrefParser/CeleraProteinParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/CeleraProteinParser.pm @@ -2,26 +2,13 @@ package XrefParser::CeleraProteinParser; use strict; -use XrefParser::CeleraParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::CeleraParser); +use base qw( XrefParser::CeleraParser ); # See CeleraParser for details -sub get_sequence_type() { - - return 'peptide'; - -} - - -sub new { - - my $self = {}; - bless $self, "XrefParser::CeleraProteinParser"; - return $self; - +sub get_sequence_type() +{ + return 'peptide'; } 1; diff --git a/misc-scripts/xref_mapping/XrefParser/CeleraTranscriptParser.pm b/misc-scripts/xref_mapping/XrefParser/CeleraTranscriptParser.pm index 99a2e8de6e36d1230e16cd85a7583d2f1b9844d6..fb08e9f14061990e92b097eaee3dbbec64b931de 100644 --- a/misc-scripts/xref_mapping/XrefParser/CeleraTranscriptParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/CeleraTranscriptParser.pm @@ -2,26 +2,13 @@ package XrefParser::CeleraTranscriptParser; use strict; -use XrefParser::CeleraParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::CeleraParser); +use base qw( XrefParser::CeleraParser ); # See CeleraParser for details -sub get_sequence_type() { - - return 'dna'; - -} - - -sub new { - - my $self = {}; - bless $self, "XrefParser::CeleraTranscriptParser"; - return $self; - +sub get_sequence_type() +{ + return 'dna'; } 1; diff --git a/misc-scripts/xref_mapping/XrefParser/CodelinkParser.pm b/misc-scripts/xref_mapping/XrefParser/CodelinkParser.pm index 
433ce2e0437a6d5c3486870e654f63b82047f582..061ad01da65fd7556258af882f54c0abcd304466 100644 --- a/misc-scripts/xref_mapping/XrefParser/CodelinkParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/CodelinkParser.pm @@ -3,10 +3,7 @@ package XrefParser::CodelinkParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parser for Codelink probes @@ -23,13 +20,13 @@ sub run { local $/ = "\n>"; - if(!open(CODELINK,"<".$file)){ + my $codelink_io = $self->get_filehandle($file); + if ( !defined $codelink_io ) { print "ERROR: Could not open $file\n"; - return 1; # 1 = error + return 1; # 1 = error } - while (<CODELINK>) { - + while ( $_ = $codelink_io->getline() ) { my $xref; my ($header, $sequence) = $_ =~ /^>?(.+?)\n([^>]*)/s or warn("Can't parse FASTA entry: $_\n"); @@ -53,6 +50,8 @@ sub run { } + $codelink_io->close(); + print scalar(@xrefs) . " Codelink xrefs succesfully parsed\n"; XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); @@ -61,13 +60,4 @@ sub run { return 0; #successful } - -sub new { - - my $self = {}; - bless $self, "XrefParser::CodelinkParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/DatabaseParser.pm b/misc-scripts/xref_mapping/XrefParser/DatabaseParser.pm index a63494a011b6954d0404333cb6a635ae6f1799e8..ac72f18c90bcbb4d20fd7a5819a6663bedb7fc35 100644 --- a/misc-scripts/xref_mapping/XrefParser/DatabaseParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/DatabaseParser.pm @@ -3,10 +3,8 @@ package XrefParser::DatabaseParser; use strict; use DBI; -use XrefParser::BaseParser; -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Base class for parsers that parse from databases rather than files @@ -58,13 +56,5 @@ sub db { } -sub new { - - my $self = {}; - bless $self, "XrefParser::DatabaseParser"; - return $self; - -} - 1; diff --git 
a/misc-scripts/xref_mapping/XrefParser/EntrezGeneParser.pm b/misc-scripts/xref_mapping/XrefParser/EntrezGeneParser.pm index c17780f3820468277b9a491a118219df82bd9e33..750e46e1d1de67b1061058c1f1180bc4ab14f6b8 100644 --- a/misc-scripts/xref_mapping/XrefParser/EntrezGeneParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/EntrezGeneParser.pm @@ -4,11 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -41,14 +37,15 @@ sub run { my $species_tax_id = $self->get_taxonomy_from_species_id($species_id); - if(!open(EG,"<".$file)){ - print "ERROR: Could not open $file\n"; - return 1; # 1 is an error - } + my $eg_io = $self->get_filehandle($file); + if ( !defined $eg_io ) { + print "ERROR: Could not open $file\n"; + return 1; # 1 is an error + } - my $head = <EG>; # first record are the headers + my $head = $eg_io->getline(); # first record are the headers chomp $head; my (@arr) = split(/\s+/,$head); # process this to the correct indexes to use. (incase they change); @@ -92,7 +89,7 @@ sub run { } my $xref_count = 0; my $syn_count = 0; - while (<EG>) { + while ( $_ = $eg_io->getline() ) { chomp; my (@arr) = split(/\t/,$_); if($arr[$gene_tax_id_index] != $species_tax_id){ @@ -110,18 +107,12 @@ sub run { $syn_count++; } } + + $eg_io->close(); + print $xref_count." 
EntrezGene Xrefs added with $syn_count synonyms\n"; return 0; #successful } - - -sub new { - - my $self = {}; - bless $self, "XrefParser::EntrezGeneParser"; - return $self; - -} 1; diff --git a/misc-scripts/xref_mapping/XrefParser/FastaParser.pm b/misc-scripts/xref_mapping/XrefParser/FastaParser.pm index 44da5797de886d5f0ccfc5f5ddc4011a0c6077ff..ebde3171f07a6e76721aeee2701c0d6429b36ff1 100644 --- a/misc-scripts/xref_mapping/XrefParser/FastaParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/FastaParser.pm @@ -4,10 +4,7 @@ use strict; use Bio::SeqIO; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Fasta file format, e.g. # >foo peptide sequence for the foo gene @@ -55,13 +52,4 @@ sub run { return 0; #successful } - -sub new { - - my $self = {}; - bless $self, "XrefParser::FastaParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm b/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm index 653589e35a625c1f4ff374170ce9213bdb832d0d..31db32db96dec603e107e0fb83b9697e1cb97fe9 100644 --- a/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm +++ b/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm @@ -10,14 +10,12 @@ package XrefParser::Flybase_dmel_GFFv3_Parser; use strict; + use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; -use vars qw(@ISA); use Bio::EnsEMBL::Utils::Exception; - -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); my %cache_source =(); @@ -40,9 +38,8 @@ if (!defined(caller())) { # -------------------------------------------------------------------------------- sub new { - my ($class,@args) = @_; - my $self={}; - bless $self,$class; + my $proto = shift; + my $self = $proto->SUPER::new(@_); $self->external_source_db_name('flybase_gff'); @@ -165,11 +162,15 @@ sub create_xrefs { my ($self, $flybase_source_id, 
$file) = @_; print STDERR "starting to parse $file...." ; - if(!open(GFF, $file)){ + + my $gff_io = $self->get_filehandle($file); + + if ( !defined $gff_io ) { print "ERROR: Can't open the GFF file $file\n"; return 0; } - while (<GFF>) { + + while ( $_ = $gff_io->getline() ) { chomp; my @col = split /\s+/; if($col[3]){ @@ -202,7 +203,9 @@ sub create_xrefs { } } - close (GFF); + + $gff_io->close(); + return 1; } diff --git a/misc-scripts/xref_mapping/XrefParser/GOParser.pm b/misc-scripts/xref_mapping/XrefParser/GOParser.pm index 00869d3d6f01deca7af70617787f059c7a6e55d2..eac9d917f39f7828b875476065febf902a37bf28 100644 --- a/misc-scripts/xref_mapping/XrefParser/GOParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/GOParser.pm @@ -6,12 +6,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - - +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -63,13 +58,16 @@ sub run { my $count = 0; - if(!open(GO,"<".$file)){ + my $go_io = $self->get_filehandle($file); + + if ( !defined $go_io ) { print "ERROR: Could not open $file\n"; - return 1; # 1 error + return 1; # 1 error } + my $taxon_line = "taxon:".$species_id; my $miss =0; - while (<GO>) { + while ( $_ = $go_io->getline() ) { if(/$taxon_line/){ chomp; my @array = split (/\t/,$_); @@ -168,16 +166,11 @@ sub run { } } } + + $go_io->close(); + print "\t$count GO dependent xrefs added $refseq_miss refseq not found and $swiss_miss Swissprot not found \n"; return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::GOParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/HUGOParser.pm b/misc-scripts/xref_mapping/XrefParser/HUGOParser.pm index 1ff87fa5405165339a79c5cd9e34b6426d85bc24..c207105f6f59a021f48aa228a9a5cf04612bb02f 100644 --- 
a/misc-scripts/xref_mapping/XrefParser/HUGOParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/HUGOParser.pm @@ -3,10 +3,8 @@ package XrefParser::HUGOParser; use strict; use File::Basename; -use XrefParser::BaseParser; +use base qw( XrefParser::BaseParser ); -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); my $xref_sth ; my $dep_sth; my $syn_sth; @@ -67,18 +65,20 @@ sub run { my $entrezgene_count = 0; my $mismatch = 0; - if(!open (HUGO, "<$file")){ - print "ERROR: Can't open HUGO file $file\n"; + my $hugo_io = $self->get_filehandle($file); + + if ( !defined $hugo_io ) { + print "ERROR: Can't open HUGO file $file\n"; return 1; } - <HUGO>; + $_ = $hugo_io->getline(); #23 ABAT 4-aminobutyrate aminotransferase P80404 #29 ABCA1 ATP-binding cassette, sub-family A (ABC1), member 1 ABC1, HDLDT1 O95477 #40 ABCB1 ATP-binding cassette, sub-family B (MDR/TAP), member 1 PGY1, MDR1, CLCS P-gp, CD243, GP170, ABC20 P08183 NM_000927 - while (<HUGO>) { + while ( $_ = $hugo_io->getline() ) { chomp; @@ -168,7 +168,7 @@ sub run { } # while HUGO - close (HUGO); + $hugo_io->close(); print "Loaded a total of " . ($swiss_count + $refseq_count + $entrezgene_count) . 
" HUGO xrefs, $refseq_count from RefSeq curated mappings and $swiss_count from Uniprot (mapped) and $entrezgene_count from EntrezGene mappings\n"; @@ -182,15 +182,6 @@ sub rename_url_file{ return "hugo.txt"; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::HUGOParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/HUGO_CCDSParser.pm b/misc-scripts/xref_mapping/XrefParser/HUGO_CCDSParser.pm index 394cf49af58bd32fdb15e3bbdf3d3ea58ee15e26..ca172bf50fc9e812e9a35040305bfb78cd17eb84 100644 --- a/misc-scripts/xref_mapping/XrefParser/HUGO_CCDSParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/HUGO_CCDSParser.pm @@ -4,10 +4,7 @@ use strict; use DBI; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parse file of HGNC records and assign direct xrefs # All assumed to be linked to genes @@ -16,7 +13,9 @@ sub run { my ($self, $file, $source_id, $species_id) = @_; - if(!open(HUGO,"<".$file)){ + my $hugo_io = $self->get_filehandle($file); + + if ( !defined $hugo_io ) { print "Could not open $file\n"; return 1; } @@ -68,7 +67,8 @@ sub run { my %seen; my $ignore_count = 0; my $ignore_examples =""; - while(<HUGO>){ + + while( $_ = $hugo_io->getline() ) { chomp; my ($ccds,$hgnc) = split; @@ -96,17 +96,9 @@ sub run { print $ignore_count." 
ignoreed due to numbers no identifiers being no longer valid :- $ignore_examples\n"; } - close(HUGO); - return 0; + $hugo_io->close(); -} - - -sub new { - - my $self = {}; - bless $self, "XrefParser::HUGO_CCDSParser"; - return $self; + return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/HUGO_ENSGParser.pm b/misc-scripts/xref_mapping/XrefParser/HUGO_ENSGParser.pm index af972d801594adeb45f16927fee26acdd21691c0..664904ff7b2a76571c54d5d2d61567d325df6a4b 100644 --- a/misc-scripts/xref_mapping/XrefParser/HUGO_ENSGParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/HUGO_ENSGParser.pm @@ -4,10 +4,7 @@ use strict; use DBI; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parse file of HGNC records and assign direct xrefs # All assumed to be linked to genes @@ -16,15 +13,16 @@ sub run { my ($self, $file, $source_id, $species_id) = @_; - if(!open(HUGO,"<".$file)){ + my $hugo_io = $self->get_filehandle($file); + + if ( !defined $hugo_io ) { print "Could not open $file\n"; return 1; } + my $line_count = 0; my $xref_count = 0; - - # becouse the direct mapping have no descriptions etc # we have to steal these fromt he previous HUGO parser. # This is why the order states this is after the other one. @@ -51,7 +49,8 @@ sub run { my $ignore_count = 0; my $ignore_examples =""; my %acc; - while (<HUGO>) { + + while ( $_ = $hugo_io->getline() ) { my ($hgnc, $stable_id) = split; @@ -79,17 +78,10 @@ sub run { if($ignore_count){ print $ignore_count." 
ignoreed due to numbers no identifiers being no longer valid :- $ignore_examples\n"; } - close(HUGO); - return 0; -} - -sub new { - - my $self = {}; - bless $self, "XrefParser::HUGO_ENSGParser"; - return $self; + $hugo_io->close(); + return 0; } 1; diff --git a/misc-scripts/xref_mapping/XrefParser/IPIParser.pm b/misc-scripts/xref_mapping/XrefParser/IPIParser.pm index e37d31b080ee72b58741766d77768aed89912cf3..0f62360f245130421eda59deebf2653fb691fdca 100644 --- a/misc-scripts/xref_mapping/XrefParser/IPIParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/IPIParser.pm @@ -3,10 +3,7 @@ package XrefParser::IPIParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # IPI file format: fasta, e.g. # >IPI:IPI00000005.1|SWISS-PROT:P01111|TREMBL:Q15104|REFSEQ_NP:NP_002515|ENSEMBL:ENSP00000261444 Tax_Id=9606 Transforming protein N-Ras @@ -22,14 +19,16 @@ sub run { local $/ = "\n>"; - if(!open(IPI,"<".$file)){ + my $ipi_io = $self->get_filehandle($file); + + if ( !defined $ipi_io ) { print "ERROR: Could not open $file\n"; - return 1; # 1 = error + return 1; # 1 = error } - my $species_tax_id = $self->get_taxonomy_from_species_id($species_id); - while (<IPI>) { + my $species_tax_id = $self->get_taxonomy_from_species_id($species_id); + while ( $_ = $ipi_io->getline() ) { my $xref; my ($header, $sequence) = $_ =~ /^>?(.+?)\n([^>]*)/s or warn("Can't parse FASTA entry: $_\n"); @@ -64,6 +63,8 @@ sub run { } + $ipi_io->close(); + print scalar(@xrefs) . 
" IPI xrefs succesfully parsed\n"; XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); @@ -72,13 +73,4 @@ sub run { return 0; #successful } - -sub new { - - my $self = {}; - bless $self, "XrefParser::IPIParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/IlluminaParser.pm b/misc-scripts/xref_mapping/XrefParser/IlluminaParser.pm index 10be9f5d078614cb2fa17344807cf952764d792c..d959b579b031b67f1d0479714068d2007e5273e2 100644 --- a/misc-scripts/xref_mapping/XrefParser/IlluminaParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/IlluminaParser.pm @@ -2,10 +2,7 @@ package XrefParser::IlluminaParser; use strict; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parser for Illumina V2 xrefs - V1 are done by the vanilla FastaParser @@ -21,13 +18,14 @@ sub run { my @xrefs; - if(!open(FILE,"<".$file)){ + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { print "Could not open $file\n"; return 1; } - while (<FILE>) { - + while ( $_ = $file_io->getline() ) { chomp; my $xref; @@ -63,7 +61,7 @@ sub run { } - close(FILE); + $file_io->close(); print scalar(@xrefs) . 
" Illumina V2 xrefs succesfully parsed\n"; @@ -73,13 +71,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::IlluminaParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/InterproParser.pm b/misc-scripts/xref_mapping/XrefParser/InterproParser.pm index 31e7706bf63931b1438d7f05ebb24782f46ae31a..defaf5fd35a0c2f4ebe63c7337a1d1a8ab564f27 100644 --- a/misc-scripts/xref_mapping/XrefParser/InterproParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/InterproParser.pm @@ -4,11 +4,8 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; +use base qw( XrefParser::BaseParser ); -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - my $xref_sth ; my $dep_sth; @@ -57,15 +54,18 @@ sub run { my $dir = dirname($file); - + my %short_name; my %description; my %pfam; - - if(!open (XML, $dir."/interpro.xml")){ + + my $xml_io = $self->get_filehandle( $dir . "/interpro.xml" ); + + if ( !defined $xml_io ) { print "ERROR: Can't open hugo interpro file $dir/interpro.xml\n"; - return 1; # 1= error + return 1; # 1= error } + #<interpro id="IPR001023" type="Family" short_name="Hsp70" protein_count="1556"> # <name>Heat shock protein Hsp70</name> # <db_xref protein_count="18" db="PFAM" dbkey="PF01278" name="Omptin" /> @@ -77,8 +77,8 @@ sub run { my $last = ""; my $i =0; - while (<XML>) { + while ( $_ = $xml_io->getline() ) { my $interpro; my $short_name; my $name; @@ -109,7 +109,8 @@ sub run { } } - close (LONG); + $xml_io->close(); + for my $db ( keys %count ) { print "\t".$count{$db}." 
$db loaded.\n"; } @@ -126,14 +127,4 @@ sub get_xref{ return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::InterproParser"; - return $self; - -} - 1; - - diff --git a/misc-scripts/xref_mapping/XrefParser/JGI_Parser.pm b/misc-scripts/xref_mapping/XrefParser/JGI_Parser.pm index 58377e30112f66fb87eb592b748da973da528d76..4cb15cd0a24308d053b52e850132b2a053db0b28 100644 --- a/misc-scripts/xref_mapping/XrefParser/JGI_Parser.pm +++ b/misc-scripts/xref_mapping/XrefParser/JGI_Parser.pm @@ -3,10 +3,7 @@ package XrefParser::JGI_Parser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # JGI protein file with gene predictons - FASTA FORMAT # @@ -50,11 +47,14 @@ sub run { local $/ = "\n>"; - if(!open(FILE,"<".$file)){ - print "ERROR: Could not open $file\n"; - return 1; # 1 is an error + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { + print "ERROR: Could not open $file\n"; + return 1; # 1 is an error } - while (<FILE>) { + + while ( $_ = $file_io->getline() ) { next if (/^File:/); # skip header @@ -122,7 +122,7 @@ sub run { } - close (FILE); + $file_io->close(); print scalar(@xrefs) . 
" JGI_ xrefs succesfully parsed\n"; @@ -133,11 +133,14 @@ sub run { } -sub new { - my $self = {}; - bless $self, "XrefParser::JGI_Parser"; - print "\n\nh ave new jp\n" ; - return $self; +sub new +{ + my $proto = shift; + my $self = $proto->SUPER::new(@_); + + print "\n\nhave new jp\n"; + + return $self; } 1; diff --git a/misc-scripts/xref_mapping/XrefParser/JGI_ProteinParser.pm b/misc-scripts/xref_mapping/XrefParser/JGI_ProteinParser.pm index 218e8fe9dcc09105b68b90521d412fbcee82d3a0..90afb658b74b3b137d8bfe75ed019e77c5cfabd1 100644 --- a/misc-scripts/xref_mapping/XrefParser/JGI_ProteinParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/JGI_ProteinParser.pm @@ -3,21 +3,12 @@ package XrefParser::JGI_ProteinParser; use strict; -use XrefParser::JGI_Parser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::JGI_Parser); +use base qw( XrefParser::JGI_Parser ); # See JGI_Parser for details + sub get_sequence_type() { return 'peptide'; } - -sub new { - my $self = {}; - bless $self, "XrefParser::JGI_ProteinParser"; - return $self; -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/MGDParser.pm b/misc-scripts/xref_mapping/XrefParser/MGDParser.pm index c5bb48cb4256e74b27776e489eb87a1522deba5d..85b9aa5dc9ee6e44e9ec6a4bf0e0c701442c591e 100644 --- a/misc-scripts/xref_mapping/XrefParser/MGDParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/MGDParser.pm @@ -4,11 +4,8 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; +use base qw( XrefParser::BaseParser ); -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - my $xref_sth ; my $dep_sth; @@ -51,11 +48,14 @@ sub run { my $mismatch = 0; my %mgi_good; - if(!open(FILE,"<". 
$file)){ - print "ERROR: Could not open file $file"; - return 1; # 1 is an error + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { + print "ERROR: Could not open file $file"; + return 1; # 1 is an error } - while(my $line = <FILE>){ + + while ( my $line = $file_io->getline() ) { chomp $line; my ($key,$label,$desc,$sps) = (split("\t",$line))[0,1,3,6]; my @sp = split(/\s/,$sps); @@ -70,19 +70,21 @@ sub run { } } } - close FILE; - + $file_io->close(); my $dir = dirname($file); my $syn_file = $dir."/MRK_Synonym.sql.rpt"; - if(!open(FILE2,"<". $syn_file)){ - print "ERROR: Could not open file $syn_file"; + $file_io = $self->get_filehandle($syn_file); + + if ( !defined $file_io ) { + print "ERROR: Could not open file $syn_file"; return 1; } + my $synonyms=0; - while(<FILE2>){ + while ( $_ = $file_io->getline() ) { if(/MGI:/){ chomp ; my ($key,$syn) = (split)[0,4]; @@ -92,23 +94,15 @@ sub run { } } } - close FILE2; + + $file_io->close(); + print "\t$count xrefs succesfully loaded\n"; print "\t$synonyms synonyms successfully loaded\n"; print "\t$mismatch xrefs failed to load\n"; return 0; - - -} - -sub new { - - my $self = {}; - bless $self, "XrefParser::MGDParser"; - return $self; - } - + 1; diff --git a/misc-scripts/xref_mapping/XrefParser/MIMParser.pm b/misc-scripts/xref_mapping/XrefParser/MIMParser.pm index a21828179fb02e4d7d2be7b63fa773ca6a13bacd..c634c09925bfd4ab0ce565d2da499c6cff08041b 100644 --- a/misc-scripts/xref_mapping/XrefParser/MIMParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/MIMParser.pm @@ -4,11 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -52,16 +48,21 @@ sub run { local $/ = "*RECORD*"; - if(!open(MIM,"<".$file)){ - print "ERROR: Could not 
open $file\n"; - return 1; # 1 is an error + my $mim_io = $self->get_filehandle($file); + + if ( !defined $mim_io ) { + print "ERROR: Could not open $file\n"; + return 1; # 1 is an error } - + my $gene = 0; my $phenotype = 0; my $removed_count =0; - <MIM>; # first record is empty with *RECORD* as the record seperator - while (<MIM>) { + + $mim_io->getline(); # first record is empty with *RECORD* as the + # record seperator + + while ( $_ = $mim_io->getline() ) { #get the MIM number my $number = 0; my $description = undef; @@ -101,6 +102,9 @@ sub run { } } } + + $mim_io->close(); + my $syn_count =0; foreach my $mim (keys %old_to_new){ my $old= $mim; @@ -118,14 +122,4 @@ sub run { return 0; #successful } - - -sub new { - - my $self = {}; - bless $self, "XrefParser::MIMParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/OTTTParser.pm b/misc-scripts/xref_mapping/XrefParser/OTTTParser.pm index 7d188c3a6ad423a749b67408f2b087a6524a8d19..a56b1542273569331c3a9d88f300911b7b3fa217 100644 --- a/misc-scripts/xref_mapping/XrefParser/OTTTParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/OTTTParser.pm @@ -4,10 +4,7 @@ use strict; use DBI; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parse file of Ensembl - Vega OTTT transcript mappings # ENST00000373795: OTTHUMT00000010392 @@ -19,17 +16,19 @@ sub run { my ($self, $file, $source_id, $species_id) = @_; - if(!open(OTTT,"<".$file)){ + my $ottt_io = $self->get_filehandle($file); + + if ( !defined $ottt_io ) { print "Could not open $file\n"; return 1; } + my $line_count = 0; my $xref_count = 0; my $xref_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? 
AND source_id=$source_id AND species_id=$species_id"); - while (<OTTT>) { - + while ( $_ = $ottt_io->getline() ) { my ($ens, $ottt) = split; $ens =~ s/://g; @@ -48,19 +47,12 @@ sub run { } - print "Parsed $line_count OTTT identifiers from $file, added $xref_count xrefs and $line_count direct_xrefs\n"; + $ottt_io->close(); - close(OTTT); - return 0; -} - - -sub new { + print "Parsed $line_count OTTT identifiers from $file, added $xref_count xrefs and $line_count direct_xrefs\n"; - my $self = {}; - bless $self, "XrefParser::OTTTParser"; - return $self; + return 0; } 1; diff --git a/misc-scripts/xref_mapping/XrefParser/RGDParser.pm b/misc-scripts/xref_mapping/XrefParser/RGDParser.pm index 6eb169a3751a698a3d6a4e82a3a5ab377ab3e96a..8f147c02a872ba075f321c0725b3c4f496ac7232 100644 --- a/misc-scripts/xref_mapping/XrefParser/RGDParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RGDParser.pm @@ -4,10 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); my $xref_sth ; my $dep_sth; @@ -47,11 +44,14 @@ sub run { my (%refseq) = %{XrefParser::BaseParser->get_valid_codes("refseq",$species_id)}; - if(!open(RGD,"<".$file)){ + my $rgd_io = $self->get_filehandle($file); + + if ( !defined $rgd_io ) { print "ERROR: Could not open $file\n"; return 1; } - my $line = <RGD>; + + my $line = $rgd_io->getline(); chomp $line; my @linearr = split(/\t/,$line); @@ -74,7 +74,7 @@ sub run { my $count= 0; my $mismatch = 0; - while ($line = <RGD>) { + while ( $line = $rgd_io->getline() ) { chomp $line; my ($rgd, $symbol, $name, $refseq) = (split (/\t/,$line))[0,1,2,16]; my @nucs = split(/\,/,$refseq); @@ -94,6 +94,7 @@ sub run { } } } + if(!$done){ # print STDERR "$rgd FAILED for $failed_list\n"; $self->add_xref("RGD:".$rgd,"",$symbol,$name,$source_id,$species_id); @@ -101,18 +102,12 @@ sub run { } } + + $rgd_io->close(); + print "\t$count xrefs succesfully loaded 
and dependent on refseq\n"; print "\t$mismatch xrefs added but with NO dependencies\n"; return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::RGDParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm index 9a1392014ff6f605527331da9edaf06367d63b96..efad047380aed159c2d2a0c28e45132fa184c991 100644 --- a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm @@ -6,10 +6,7 @@ use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw( XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -49,8 +46,10 @@ sub run { my $pred_dna_source_id = XrefParser::BaseParser->get_source_id_for_source_name('RefSeq_dna_predicted'); print "RefSeq_peptide_predicted source ID = $pred_peptide_source_id; RefSeq_dna_predicted source ID = $pred_dna_source_id\n"; + my $xrefs = + $self->create_xrefs( $peptide_source_id, $dna_source_id, + $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id ); - my $xrefs = create_xrefs($peptide_source_id, $dna_source_id, $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id); if(!defined($xrefs)){ return 1; #error } @@ -68,8 +67,10 @@ sub run { # Slightly different formats sub create_xrefs { + my $self = shift; - my ($peptide_source_id, $dna_source_id, $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id) = @_; + my ( $peptide_source_id, $dna_source_id, $pred_peptide_source_id, + $pred_dna_source_id, $file, $species_id ) = @_; my %name2species_id = XrefParser::BaseParser->name2species_id(); @@ -78,10 +79,13 @@ sub create_xrefs { # my (%genemap) = %{XrefParser::BaseParser->get_valid_codes("mim_gene",$species_id)}; # my (%morbidmap) = 
%{XrefParser::BaseParser->get_valid_codes("mim_morbid",$species_id)}; - if(!open(REFSEQ, $file)){ + my $refseq_io = $self->get_filehandle($file); + + if ( !defined $refseq_io ) { print "ERROR: Can't open RefSeqGPFF file $file\n"; return undef; } + my @xrefs; local $/ = "\/\/\n"; @@ -109,7 +113,7 @@ sub create_xrefs { } - while (<REFSEQ>) { + while ( $_ = $refseq_io->getline() ) { my $xref; @@ -224,7 +228,7 @@ sub create_xrefs { } # while <REFSEQ> - close (REFSEQ); + $refseq_io->close(); print "Read " . scalar(@xrefs) ." xrefs from $file\n"; @@ -234,14 +238,4 @@ sub create_xrefs { # -------------------------------------------------------------------------------- -sub new { - - my $self = {}; - bless $self, "XrefParser::RefSeqGPFFParser"; - return $self; - -} - -# -------------------------------------------------------------------------------- - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm index 1557f53fc528708f91c132f7b411f5f729b39062..1625701a107635e9c9b0ff88feb4ab1d9884970e 100644 --- a/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm @@ -6,10 +6,7 @@ use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -46,7 +43,10 @@ sub run { $species_id = XrefParser::BaseParser->get_species_id_for_filename($file); } - my $xrefs = create_xrefs($peptide_source_id, $dna_source_id, $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id); + my $xrefs = + $self->create_xrefs( $peptide_source_id, $dna_source_id, + $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id ); + if(!defined($xrefs)){ return 1; #error } @@ -65,21 +65,25 @@ sub run { # Slightly different formats sub 
create_xrefs { + my $self = shift; - my ($peptide_source_id, $dna_source_id, $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id) = @_; + my ( $peptide_source_id, $dna_source_id, $pred_peptide_source_id, + $pred_dna_source_id, $file, $species_id ) = @_; my %name2species_id = XrefParser::BaseParser->name2species_id(); - if(!open(REFSEQ, $file)){ - print "ERROR: Can't open RefSeq file $file\n"; - return undef; + my $refseq_io = $self->get_filehandle($file); + + if ( !defined $refseq_io ) { + print "ERROR: Can't open RefSeq file $file\n"; + return undef; } + my @xrefs; local $/ = "\n>"; - while (<REFSEQ>) { - + while ( $_ = $refseq_io->getline() ) { my $xref; my $entry = $_; @@ -92,7 +96,7 @@ sub create_xrefs { (my $gi, my $n, my $ref, my $acc, my $description) = split(/\|/, $header); my ($species, $mrna); - if ($file =~ /\.faa$/) { + if ($file =~ /\.faa(\.gz|\.Z)?$/) { ($mrna, $description, $species) = $description =~ /(\S*)\s+(.*)\s+\[(.*)\]$/; $xref->{SEQUENCE_TYPE} = 'peptide'; @@ -105,7 +109,7 @@ sub create_xrefs { } $xref->{SOURCE_ID} = $source_id; - } elsif ($file =~ /\.fna$/) { + } elsif ($file =~ /\.fna(\.gz|\.Z)?$/) { ($species, $description) = $description =~ /\s*(\w+\s+\w+)\s+(.*)$/; $xref->{SEQUENCE_TYPE} = 'dna'; @@ -126,8 +130,10 @@ sub create_xrefs { my $species_id_check = $name2species_id{$species}; # skip xrefs for species that aren't in the species table - if (defined($species_id) and $species_id == $species_id_check) { - + if ( defined $species_id + && defined $species_id_check + && $species_id == $species_id_check ) + { my ($acc_no_ver,$ver) = split (/\./,$acc); $xref->{ACCESSION} = $acc_no_ver; $xref->{VERSION} = $ver; @@ -144,7 +150,7 @@ sub create_xrefs { } - close (REFSEQ); + $refseq_io->close(); print "Read " . scalar(@xrefs) ." 
xrefs from $file\n"; @@ -154,14 +160,4 @@ sub create_xrefs { # -------------------------------------------------------------------------------- -sub new { - - my $self = {}; - bless $self, "XrefParser::RefSeqParser"; - return $self; - -} - -# -------------------------------------------------------------------------------- - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeq_CCDSParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeq_CCDSParser.pm index e1dcec00671462ef457df18fe5a64031b5a73079..96a376aae496f78e6d0fe8c88516e32bc9359ede 100644 --- a/misc-scripts/xref_mapping/XrefParser/RefSeq_CCDSParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RefSeq_CCDSParser.pm @@ -4,10 +4,7 @@ use strict; use DBI; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parse file of Refseq records and assign direct xrefs @@ -15,13 +12,13 @@ sub run { my ($self, $file, $source_id, $species_id) = @_; - if(!open(REFSEQ,"<".$file)){ + my $refseq_io = $self->get_filehandle($file); + + if ( !defined $refseq_io ) { print "Could not open $file\n"; return 1; } - - # becouse the direct mapping have no descriptions etc # we have to steal these from the previous Refseq parser. 
@@ -67,8 +64,10 @@ sub run { my $xref_count = 0; my %seen; my %old_to_new; - <REFSEQ>; # header - while(<REFSEQ>){ + + $refseq_io->getline(); # header + + while ( $_ = $refseq_io->getline() ) { chomp; my ($ccds,$refseq) = split; @@ -102,19 +101,12 @@ sub run { } } - print "Parsed $line_count RefSeq_dna identifiers from $file, added $xref_count xrefs and $xref_count direct_xrefs from $line_count lines.\n"; - - close(REFSEQ); - return 0; - -} + $refseq_io->close(); + print "Parsed $line_count RefSeq_dna identifiers from $file, added $xref_count xrefs and $xref_count direct_xrefs from $line_count lines.\n"; -sub new { - my $self = {}; - bless $self, "XrefParser::RefSeq_CCDSParser"; - return $self; + return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/SGDParser.pm b/misc-scripts/xref_mapping/XrefParser/SGDParser.pm index d6beb15afd2091a3e85bcc0ffd47340809e228aa..8d1f89f6048b25450932068e035e786935990296 100644 --- a/misc-scripts/xref_mapping/XrefParser/SGDParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/SGDParser.pm @@ -4,11 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -40,15 +36,17 @@ sub run { - if(!open(SGD,"<".$file)){ - print "ERROR: Could not open $file\n"; - return 1; # 1 is an error + my $sgd_io = $self->get_filehandle($file); + + if ( !defined $sgd_io ) { + print "ERROR: Could not open $file\n"; + return 1; # 1 is an error } my $xref_count =0; my $syn_count =0; - while (<SGD>) { + while ( $_ = $sgd_io->getline() ) { chomp; my ($locus_name, $alias_name, $desc, $gene_prod, $phenotype, $orf_name, $sgd_id) = split(/\t/,$_); @@ -60,18 +58,11 @@ sub run { $syn_count++; } } + + $sgd_io->close(); + print $xref_count." 
SGD Xrefs added with $syn_count synonyms\n"; return 0; #successful } - - -sub new { - - my $self = {}; - bless $self, "XrefParser::SGDParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/SegmentParser.pm b/misc-scripts/xref_mapping/XrefParser/SegmentParser.pm index 8d5605cb3b51c568b1827819752daae3b9ce2db2..8ff59a44db6505d279f4d0de31b075a5b865641a 100644 --- a/misc-scripts/xref_mapping/XrefParser/SegmentParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/SegmentParser.pm @@ -4,10 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -41,11 +38,14 @@ sub run { my %name_2_source_id=(); my $added=0; - if(!open(FILE,"<". $file)){ - print "ERROR: Could not open file $file\n"; + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { + print "ERROR: Could not open file $file\n"; return 1; } - while(my $line = <FILE>){ + + while ( my $line = $file_io->getline() ) { chomp $line; my ($gene_id,$transcript_id,$source_name,$acc,$display_label,$description, $status) = split("\t",$line); @@ -68,19 +68,11 @@ sub run { #the those mapped to the transcript to the genes anyway due to the #biomart check } - close FILE; + + $file_io->close(); print "Added $added Xrefs for Gene segments\n"; return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::SegmentParser"; - return $self; - -} - - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/UniGeneParser.pm b/misc-scripts/xref_mapping/XrefParser/UniGeneParser.pm index 1d6f7e1faf8b9b9de85ac8da9fb5359d18218870..fd0680d935f15da49b1250419e2c3b90af053e4f 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniGeneParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniGeneParser.pm @@ -6,10 +6,7 @@ use strict; use 
File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -41,7 +38,11 @@ sub run { if(!defined($species_id)){ $species_id = XrefParser::BaseParser->get_species_id_for_filename($file); } -my $xrefs =create_xrefs($unigene_source_id, $unigene_source_id, $file, $species_id); + + my $xrefs = + $self->create_xrefs( $unigene_source_id, $unigene_source_id, $file, + $species_id ); + if(!defined($xrefs)){ return 1; #error } @@ -56,23 +57,24 @@ my $xrefs =create_xrefs($unigene_source_id, $unigene_source_id, $file, $species_ my %geneid_2_desc; sub get_desc{ + my $self = shift; my $file = shift; + my $dir = dirname($file); - (my $name) = $file =~ /\/(\w+)\.seq\.uniq/; print $name."\n"; local $/ = "//"; + my $desc_io = $self->get_filehandle( $dir . '/' . $name . '.data' ); - if(!open (DESC, "$dir/$name.data")){ + if ( !defined $desc_io ) { print "ERROR: Can't open $dir/$name.data\n"; return undef; } - while(<DESC>){ - + while ( $_ = $desc_io->getline() ) { #ID Hs.159356 #TITLE Hypothetical LOC388277 @@ -82,21 +84,27 @@ sub get_desc{ $geneid_2_desc{$id} = $descrip; } + + $desc_io->close(); + return 1; } sub create_xrefs { + my $self = shift; - my ($peptide_source_id, $unigene_source_id, $file, $species_id) = @_; + my ( $peptide_source_id, $unigene_source_id, $file, $species_id ) = @_; my %name2species_id = XrefParser::BaseParser->name2species_id(); - if(!defined(get_desc($file))){ + if ( !defined( $self->get_desc($file) ) ) { return undef; } - if(!open(UNIGENE, $file)){ + my $unigene_io = $self->get_filehandle($file); + + if ( !defined $unigene_io ) { print "Can't open RefSeq file $file\n"; return undef; } @@ -110,7 +118,7 @@ sub create_xrefs { local $/ = "\n>"; - while (<UNIGENE>) { + while ( $_ = $unigene_io->getline() ) { my $xref; @@ -150,7 +158,8 @@ sub 
create_xrefs { } - close (UNIGENE); + $unigene_io->close(); + %geneid_2_desc=(); print "Read " . scalar(@xrefs) ." xrefs from $file\n"; @@ -158,16 +167,4 @@ sub create_xrefs { } -# -------------------------------------------------------------------------------- - -sub new { - - my $self = {}; - bless $self, "XrefParser::UniGeneParser"; - return $self; - -} - -# -------------------------------------------------------------------------------- - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm b/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm index 97c5dcdf03425e344021fa4d78480c42a1847922..e15ad5a66af43f16facfb4762f96cd2f97338c6e 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm @@ -13,10 +13,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -54,7 +51,10 @@ sub run { print "SpTREMBL source id for $file: $sptr_source_id\n"; - my @xrefs = create_xrefs($sp_source_id, $sptr_source_id, $species_id, $file); + my @xrefs = + $self->create_xrefs( $sp_source_id, $sptr_source_id, $species_id, + $file ); + if ( !@xrefs ) { return 1; # 1 error } @@ -110,8 +110,9 @@ sub get_species { # Parse file into array of xref objects sub create_xrefs { + my $self = shift; - my ($sp_source_id, $sptr_source_id, $species_id, $file) = @_; + my ( $sp_source_id, $sptr_source_id, $species_id, $file ) = @_; my $num_sp = 0; my $num_sptr = 0; @@ -135,15 +136,14 @@ sub create_xrefs { my (%genemap) = %{XrefParser::BaseParser->get_valid_codes("mim_gene",$species_id)}; my (%morbidmap) = %{XrefParser::BaseParser->get_valid_codes("mim_morbid",$species_id)}; - if(!open(UNIPROT, $file)){ - print"Can't open Swissprot file $file\n"; - 
return undef; - } + my $uniprot_io = $self->get_filehandle($file); + if ( !defined $uniprot_io ) { return undef } + my @xrefs; - local $/ = "\/\/\n"; + local $/ = "//\n"; - while (<UNIPROT>) { + while ( $_ = $uniprot_io->getline() ) { # if an OX line exists, only store the xref if the taxonomy ID that the OX # line refers to is in the species table @@ -353,7 +353,7 @@ sub create_xrefs { } - close (UNIPROT); + $uniprot_io->close(); print "Read $num_sp SwissProt xrefs and $num_sptr SPTrEMBL xrefs from $file\n"; print "Found $num_sp_pred predicted SwissProt xrefs and $num_sptr_pred predicted SPTrEMBL xrefs\n" if ($num_sp_pred > 0 || $num_sptr_pred > 0); @@ -363,16 +363,4 @@ sub create_xrefs { #TODO - currently include records from other species - filter on OX line?? } -# -------------------------------------------------------------------------------- - -sub new { - - my $self = {}; - bless $self, "XrefParser::UniProtParser"; - return $self; - -} - -# -------------------------------------------------------------------------------- - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/UniProtVarSplicParser.pm b/misc-scripts/xref_mapping/XrefParser/UniProtVarSplicParser.pm index 0486f04a67f25085cc457dca1419fcb3555b42b2..9001395c4939a591c76c42a7b5ab5a9492a5900d 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniProtVarSplicParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniProtVarSplicParser.pm @@ -5,10 +5,7 @@ package XrefParser::UniProtVarSplicParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # UniProtVarSplic file format: fasta, e.g. 
@@ -27,17 +24,18 @@ sub run { local $/ = "\n>"; - if(!open(FILE,"<".$file)){ - print "ERROR: Could not open $file\n"; - return 1; # 1 error + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { + print "ERROR: Could not open $file\n"; + return 1; # 1 error } my $species_tax_id = $self->get_taxonomy_from_species_id($species_id); my (%swiss) = %{XrefParser::BaseParser->get_valid_codes("uniprot",$species_id)}; my $missed = 0; - while (<FILE>) { - + while ( $_ = $file_io->getline() ) { my $xref; my ($header, $sequence) = $_ =~ /^>?(.+?)\n([^>]*)/s or warn("Can't parse FASTA entry: $_\n"); @@ -69,7 +67,7 @@ sub run { } } - close (FILE); + $file_io->close(); print $missed." ignored as original uniprot not found in database\n"; print scalar(@xrefs) . " UniProtVarSplic xrefs succesfully parsed\n"; @@ -80,15 +78,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::UniProtVarSplicParser"; - return $self; - -} - 1; - - diff --git a/misc-scripts/xref_mapping/XrefParser/WilsonAffyParser.pm b/misc-scripts/xref_mapping/XrefParser/WilsonAffyParser.pm index fcd0eda3da6d247a717c57fd98b5f7f6f791e64f..df059b421c7397528fe472262845993760d4fa86 100644 --- a/misc-scripts/xref_mapping/XrefParser/WilsonAffyParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/WilsonAffyParser.pm @@ -2,16 +2,12 @@ package XrefParser::WilsonAffyParser; use strict; -use XrefParser::BaseParser; +use base qw( XrefParser::BaseParser ); -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); my $xref_sth ; my $dep_sth; my $syn_sth; - - sub run { my ($self, $file, $source_id, $species_id) = @_; @@ -39,14 +35,16 @@ sub create_xrefs { my @xrefs; - if(!open(FILE,"<".$file)){ + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { print "ERROR: Could not open $file\n"; - return 1; # 1 error + return 1; # 1 error } - <FILE>; # skip first line - while (<FILE>) { + $file_io->getline(); # skip first line + while ( $_ = $file_io->getline() ) { 
#last if ($count > 200); my $xref; @@ -79,10 +77,13 @@ sub create_xrefs { # fetch sequence for others (EMBL ESTs and RefSeqs - pfetch will handle these) system ("pfetch -q $target > seq.txt"); - open(SEQ, "<seq.txt"); - my $seq = <SEQ>; + + my $seq_io = $self->get_filehandle('seq.txt'); + + my $seq = $seq_io->getline(); + $seq_io->close(); + chomp($seq); - close(SEQ); if ($seq && $seq !~ /no match/) { @@ -116,7 +117,7 @@ sub create_xrefs { } - close(FILE); + $file_io->close(); print "\n\nParsed $count primary xrefs.\n"; print "Couldn't get sequence for $noseq primary_xrefs\n" if ($noseq); @@ -126,12 +127,4 @@ sub create_xrefs { } -sub new { - - my $self = {}; - bless $self, "XrefParser::WilsonAffyParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/WormPepParser.pm b/misc-scripts/xref_mapping/XrefParser/WormPepParser.pm index c31253be7d0d800d58765b8023a020254d11febf..8f27dbfb86498248455b28ec487185a5b7b8e5b7 100644 --- a/misc-scripts/xref_mapping/XrefParser/WormPepParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/WormPepParser.pm @@ -3,10 +3,8 @@ package XrefParser::WormPepParser; use strict; use File::Basename; -use XrefParser::BaseParser; +use base qw( XrefParser::BaseParser ); -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); my $xref_sth ; my $dep_sth; @@ -31,15 +29,17 @@ sub run { my $xref_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$worm_source_id AND species_id=$species_id"); my $xref_sth2 = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? 
AND source_id=$worm_locus_id AND species_id=$species_id"); - if(!open(PEP,"<".$file)){ + my $pep_io = $self->get_filehandle($file); + + if ( !defined $pep_io ) { print "ERROR: Could not open $file\n"; - return 1; # 1 error + return 1; # 1 error } - my ($x_count, $d_count); + my ($x_count, $d_count); - while (<PEP>) { + while ( $_ = $pep_io->getline() ) { my ($transcript, $wb, $display) = (split(/\t/,substr($_,1)))[0,1,2]; # reuse or create xref @@ -69,19 +69,10 @@ sub run { $d_count++; } - close (PEP); + $pep_io->close(); print "Added $d_count direct xrefs and $x_count xrefs\n"; return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::WormPepParser"; - return $self; - -} - 1; - diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseDatabaseStableIDParser.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseDatabaseStableIDParser.pm index c18fa623664a122b55302f20e004989b7877dc7a..92d43cabed1b1ff1730a8079939cd5eb61316087 100644 --- a/misc-scripts/xref_mapping/XrefParser/WormbaseDatabaseStableIDParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/WormbaseDatabaseStableIDParser.pm @@ -7,10 +7,7 @@ package XrefParser::WormbaseDatabaseStableIDParser; use strict; -use XrefParser::DatabaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::DatabaseParser); +use base qw( XrefParser::DatabaseParser ); sub run { @@ -46,13 +43,5 @@ sub run { return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::WormbaseDatabaseStableIDParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/XenopusJamboreeParser.pm b/misc-scripts/xref_mapping/XrefParser/XenopusJamboreeParser.pm index 452855aea873f28851b20e72c02cfe8536488269..02c1e7591f2c2abfab11efd910fa23ee74d7d239 100644 --- a/misc-scripts/xref_mapping/XrefParser/XenopusJamboreeParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/XenopusJamboreeParser.pm @@ -5,10 +5,7 @@ package XrefParser::XenopusJamboreeParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars 
qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Xenopus Jamboree peptides file format: fasta, e.g. @@ -24,14 +21,16 @@ sub run { local $/ = "\n>"; - if(!open(FILE,"<".$file)){ + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { print "ERROR: Could not open $file\n"; - return 1; # 1 error + return 1; # 1 error } - my $species_tax_id = $self->get_taxonomy_from_species_id($species_id); - while (<FILE>) { + my $species_tax_id = $self->get_taxonomy_from_species_id($species_id); + while ( $_ = $file_io->getline() ) { my $xref; my ($header, $sequence) = $_ =~ /^>?(.+?)\n([^>]*)/s or warn("Can't parse FASTA entry: $_\n"); @@ -57,7 +56,7 @@ sub run { } - close (FILE); + $file_io->close(); print scalar(@xrefs) . " XenopusJamboreeParser xrefs succesfully parsed\n"; @@ -69,13 +68,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::XenopusJamboreeParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/ZFINParser.pm b/misc-scripts/xref_mapping/XrefParser/ZFINParser.pm index 4875998fe4672c9cf3affcefa2c4da3a46e58b4f..83ceda1bcf863de0e29543ffd99dcc03e9c48025 100644 --- a/misc-scripts/xref_mapping/XrefParser/ZFINParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/ZFINParser.pm @@ -4,12 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - - +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -45,10 +40,13 @@ sub run { my (%swiss) = %{XrefParser::BaseParser->get_valid_codes("uniprot",$species_id)}; my (%refseq) = %{XrefParser::BaseParser->get_valid_codes("refseq",$species_id)}; - if(!open(SWISSPROT,"<".$dir."/swissprot.txt")){ - print "ERROR: Could not open $dir/swissprot.txt\n"; - return 1; # 1 error + my $swissprot_io = 
$self->get_filehandle( $dir . '/swissprot.txt' ); + + if ( !defined $swissprot_io ) { + print "ERROR: Could not open $dir/swissprot.txt\n"; + return 1; # 1 error } + #e.g. #ZDB-GENE-000112-30 couptf2 O42532 #ZDB-GENE-000112-32 couptf3 O42533 @@ -58,7 +56,8 @@ sub run { my $spcount =0; my $rscount =0; my $mismatch=0; - while (<SWISSPROT>) { + + while ( $_ = $swissprot_io->getline() ) { chomp; my ($zfin, $label, $acc) = split (/\s+/,$_); if(defined($swiss{$acc})){ @@ -69,16 +68,21 @@ sub run { $mismatch++; } } - close SWISSPROT; - - if(!open(REFSEQ,"<".$dir."/refseq.txt")){ - print "ERROR: Could not open $dir/refseq.txt\n"; + + $swissprot_io->close(); + + my $refseq_io = $self->get_filehandle( $dir . '/refseq.txt' ); + + if ( !defined $refseq_io ) { + print "ERROR: Could not open $dir/refseq.txt\n"; return 1; } + #ZDB-GENE-000125-12 igfbp2 NM_131458 #ZDB-GENE-000125-12 igfbp2 NP_571533 #ZDB-GENE-000125-4 dlc NP_571019 - while (<REFSEQ>) { + + while ( $_ = $refseq_io->getline() ) { chomp; my ($zfin, $label, $acc) = split (/\s+/,$_); if(defined($refseq{$acc})){ @@ -89,18 +93,12 @@ sub run { $mismatch++; } } - close REFSEQ; + + $refseq_io->close(); + print "\t$spcount xrefs from Swissprot and $rscount xrefs from RefSeq succesfully loaded\n"; print "\t$mismatch xrefs ignored\n"; return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::ZFINParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/ncRNAParser.pm b/misc-scripts/xref_mapping/XrefParser/ncRNAParser.pm index 92710ba9ccd6fede77331814dbbf2aebd1ee1452..8ca57774fbb6a66d9f91bc75fd118613e3cfca04 100644 --- a/misc-scripts/xref_mapping/XrefParser/ncRNAParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/ncRNAParser.pm @@ -4,10 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # 
-------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -41,11 +38,14 @@ sub run { my %name_2_source_id=(); my $added=0; - if(!open(FILE,"<". $file)){ - print "ERROR: Could not open file $file\n"; + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { + print "ERROR: Could not open file $file\n"; return 1; } - while(my $line = <FILE>){ + + while ( my $line = $file_io->getline() ) { chomp $line; my ($gene_id,$transcript_id,$source_name,$acc,$display_label,$full_description, $status) = split("\t",$line); @@ -79,19 +79,11 @@ sub run { #biomart check # $self->add_direct_xref($xref_id, $gene_id, "Gene", "") if (defined($gene_id)); } - close FILE; + + $file_io->close(); print "Added $added Xrefs for ncRNAs\n"; return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::ncRNAParser"; - return $self; - -} - - 1; diff --git a/misc-scripts/xref_mapping/xref_parser.pl b/misc-scripts/xref_mapping/xref_parser.pl index 5d401928e184a9239f9463b7ef11e9322d03dd55..2fe667106f8666bdd0d568e3d1bc039328e3f1f6 100644 --- a/misc-scripts/xref_mapping/xref_parser.pl +++ b/misc-scripts/xref_mapping/xref_parser.pl @@ -3,36 +3,45 @@ use strict; use Getopt::Long; use XrefParser::BaseParser; -my ($host, $port, $dbname, $user, $pass, @species, @sources, $skipdownload, $checkdownload, $create, $release, $cleanup, $drop_existing_db, $deletedownloaded, $dl_path, @notsource); - -GetOptions('dbuser|user=s' => \$user, - 'dbpass|pass=s' => \$pass, - 'dbhost|host=s' => \$host, - 'dbport|port=i' => \$port, - 'dbname=s' => \$dbname, - 'species=s' => \@species, - 'source=s' => \@sources, - 'download_dir=s' => \$dl_path, - 'skipdownload' => \$skipdownload, # skips all downloads - 'checkdownload!' => \$checkdownload, # if file exists it won't be downloaded - 'create' => \$create, - 'setrelease=s' => \$release, - 'cleanup' => \$cleanup, - 'notsource=s' => \@notsource, - 'drop_db|dropdb!' 
=> \$drop_existing_db, # drops xref db without user interaction - 'delete_downloaded' => \$deletedownloaded, - 'download_path=s' => \$dl_path, - 'help' => sub { usage(); exit(0); }); +my ( + $host, $port, $dbname, + $user, $pass, @species, + @sources, $skipdownload, $checkdownload, + $create, $release, $cleanup, + $drop_existing_db, $deletedownloaded, $dl_path, + @notsource, $compressed +); + +GetOptions( + 'dbuser|user=s' => \$user, + 'dbpass|pass=s' => \$pass, + 'dbhost|host=s' => \$host, + 'dbport|port=i' => \$port, + 'dbname=s' => \$dbname, + 'species=s' => \@species, + 'source=s' => \@sources, + 'download_dir=s' => \$dl_path, + 'skipdownload' => \$skipdownload, # skips all downloads + 'checkdownload!' => \$checkdownload, # don't download if exists + 'create' => \$create, + 'setrelease=s' => \$release, + 'cleanup' => \$cleanup, + 'notsource=s' => \@notsource, + 'drop_db|dropdb!' => + \$drop_existing_db, # drops xref db without user interaction + 'delete_downloaded' => \$deletedownloaded, + 'download_path=s' => \$dl_path, + 'compressed' => \$compressed, # don't force decompression of files + 'help' => sub { usage(); exit(0); } +); @species = split(/,/,join(',',@species)); @sources = split(/,/,join(',',@sources)); -if (!$user || !$host || !$dbname) { - - usage(); - exit(1); - +if ( !$user || !$host || !$dbname ) { + usage(); + exit(1); } XrefParser::BaseParser::run( @@ -43,7 +52,8 @@ XrefParser::BaseParser::run( $checkdownload, $create, $release, $cleanup, $drop_existing_db, $deletedownloaded, - $dl_path, \@notsource + $dl_path, \@notsource, + $compressed ); # --------------------------------------------------------------------------------