From 2b13f775c9f4c7f666a5d69191d2fb30bcb32341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kusalananda=20K=C3=A4h=C3=A4ri?= <ak4@sanger.ac.uk> Date: Fri, 23 Feb 2007 15:09:23 +0000 Subject: [PATCH] Allow the Xref parsers to read gzip compressed files through a zcat pipe. Use the command line switch -compressed with xref_parser.pl to do this. The default behaviour (without the -compressed switch) is still to uncompress gzipped files. --- .../XrefParser/AedesGenBankParser.pm | 26 ++-- .../xref_mapping/XrefParser/AgilentParser.pm | 27 ++--- .../XrefParser/AnophelesSymbolParser.pm | 26 ++-- .../xref_mapping/XrefParser/BaseParser.pm | 113 +++++++++++++----- .../xref_mapping/XrefParser/CCDSParser.pm | 28 ++--- .../xref_mapping/XrefParser/CeleraParser.pm | 22 +--- .../XrefParser/CeleraProteinParser.pm | 21 +--- .../XrefParser/CeleraTranscriptParser.pm | 21 +--- .../xref_mapping/XrefParser/CodelinkParser.pm | 24 ++-- .../xref_mapping/XrefParser/DatabaseParser.pm | 12 +- .../XrefParser/EntrezGeneParser.pm | 31 ++--- .../xref_mapping/XrefParser/FastaParser.pm | 14 +-- .../XrefParser/Flybase_dmel_GFFv3_Parser.pm | 23 ++-- .../xref_mapping/XrefParser/GOParser.pm | 27 ++--- .../xref_mapping/XrefParser/HUGOParser.pm | 25 ++-- .../XrefParser/HUGO_CCDSParser.pm | 24 ++-- .../XrefParser/HUGO_ENSGParser.pm | 26 ++-- .../xref_mapping/XrefParser/IPIParser.pm | 26 ++-- .../xref_mapping/XrefParser/IlluminaParser.pm | 23 +--- .../xref_mapping/XrefParser/InterproParser.pm | 31 ++--- .../xref_mapping/XrefParser/JGI_Parser.pm | 31 ++--- .../XrefParser/JGI_ProteinParser.pm | 13 +- .../xref_mapping/XrefParser/MGDParser.pm | 44 +++---- .../xref_mapping/XrefParser/MIMParser.pm | 36 +++--- .../xref_mapping/XrefParser/OTTTParser.pm | 26 ++-- .../xref_mapping/XrefParser/RGDParser.pm | 27 ++--- .../XrefParser/RefSeqGPFFParser.pm | 32 ++--- .../xref_mapping/XrefParser/RefSeqParser.pm | 48 ++++---- .../XrefParser/RefSeq_CCDSParser.pm | 30 ++--- .../xref_mapping/XrefParser/SGDParser.pm | 29 
++--- .../xref_mapping/XrefParser/SegmentParser.pm | 26 ++-- .../xref_mapping/XrefParser/UniGeneParser.pm | 49 ++++---- .../xref_mapping/XrefParser/UniProtParser.pm | 38 ++---- .../XrefParser/UniProtVarSplicParser.pm | 29 ++--- .../XrefParser/WilsonAffyParser.pm | 35 +++--- .../xref_mapping/XrefParser/WormPepParser.pm | 25 ++-- .../WormbaseDatabaseStableIDParser.pm | 13 +- .../XrefParser/XenopusJamboreeParser.pm | 26 ++-- .../xref_mapping/XrefParser/ZFINParser.pm | 46 ++++--- .../xref_mapping/XrefParser/ncRNAParser.pm | 26 ++-- misc-scripts/xref_mapping/xref_parser.pl | 62 ++++++---- 41 files changed, 508 insertions(+), 753 deletions(-) diff --git a/misc-scripts/xref_mapping/XrefParser/AedesGenBankParser.pm b/misc-scripts/xref_mapping/XrefParser/AedesGenBankParser.pm index 5769b8da38..7fb3838057 100644 --- a/misc-scripts/xref_mapping/XrefParser/AedesGenBankParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/AedesGenBankParser.pm @@ -3,10 +3,7 @@ package XrefParser::AedesGenBankParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); #Aedes GenBank protein - because not yet in UniProt #>EAT48991.1 @@ -25,11 +22,13 @@ sub run { local $/ = "\n>"; - if(!open(FILE,"<".$file)){ - print "Could not open $file\n"; - return 1; + my $file_io = $self->get_filehandle($file); + if ( !defined $file_io ) { + print "Could not open $file\n"; + return 1; } - while (<FILE>) { + + while ( $_ = $file_io->getline() ) { my $xref; @@ -73,7 +72,7 @@ sub run { } - close (FILE); + $file_io->close(); print scalar(@xrefs) . 
" AedesGenBank xrefs succesfully parsed\n"; @@ -83,13 +82,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::AedesGenBankParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/AgilentParser.pm b/misc-scripts/xref_mapping/XrefParser/AgilentParser.pm index 0817341726..a9a37f555c 100644 --- a/misc-scripts/xref_mapping/XrefParser/AgilentParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/AgilentParser.pm @@ -3,10 +3,7 @@ package XrefParser::AgilentParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # OParser for FASTA-format probe mappings from Agilent # >A_23_P253586 @@ -22,12 +19,15 @@ sub run { # local $/ = "\n>"; - if(!open(AG,"<".$file)){ - print "Could not open $file\n"; - return 1; + my $ag_io = $self->get_filehandle($file); + + if ( !defined $ag_io ) { + print "Could not open $file\n"; + return 1; } + my $probe; - while (<AG>) { + while ( $_ = $ag_io->getline() ) { chomp; @@ -57,7 +57,7 @@ sub run { } } - close(AG); + $ag_io->close(); print scalar(@xrefs) . 
" Agilent xrefs succesfully parsed\n"; @@ -67,13 +67,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::AgilentParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/AnophelesSymbolParser.pm b/misc-scripts/xref_mapping/XrefParser/AnophelesSymbolParser.pm index eaf5a838e7..641b724dfb 100644 --- a/misc-scripts/xref_mapping/XrefParser/AnophelesSymbolParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/AnophelesSymbolParser.pm @@ -3,10 +3,7 @@ package XrefParser::AnophelesSymbolParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # AnophelesSymbol database dump for anopheles - FASTA format # @@ -25,12 +22,14 @@ sub run { local $/ = "\n>"; - if(!open(FILE,"<".$file)){ - print "Could not open $file\n"; - return 1; + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { + print "Could not open $file\n"; + return 1; } - while (<FILE>) { + while ( $_ = $file_io->getline() ) { my $xref; my ($header, $sequence) = $_ =~ /^>?(.+?)\n([^>]*)/s or warn("Can't parse FASTA entry: $_\n"); @@ -55,7 +54,7 @@ sub run { } - close (FILE); + $file_io->close(); print scalar(@xrefs) . 
" AnophelesSymbol xrefs succesfully parsed\n"; @@ -65,13 +64,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::AnophelesSymbolParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/BaseParser.pm b/misc-scripts/xref_mapping/XrefParser/BaseParser.pm index 9b368624af..ebcaf7246b 100644 --- a/misc-scripts/xref_mapping/XrefParser/BaseParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/BaseParser.pm @@ -27,16 +27,27 @@ my %dependent_sources; my %taxonomy2species_id; my %name2species_id; -my ($host, $port, $dbname, $user, $pass, $create, $release, $cleanup, $deletedownloaded); -my ($skipdownload,$drop_db,$checkdownload, $dl_path) ; +my ( + $host, $port, $dbname, + $user, $pass, $create, + $release, $cleanup, $deletedownloaded, + $skipdownload, $drop_db, $checkdownload, + $dl_path, $compressed +); # -------------------------------------------------------------------------------- # Get info about files to be parsed from the database -sub run { - - ($host, $port, $dbname, $user, $pass, my $speciesr, my $sourcesr, $skipdownload, $checkdownload, - $create, $release, $cleanup, $drop_db, $deletedownloaded, $dl_path, my $notsourcesr) = @_; +sub run +{ + ( + $host, $port, $dbname, + $user, $pass, my $speciesr, + my $sourcesr, $skipdownload, $checkdownload, + $create, $release, $cleanup, + $drop_db, $deletedownloaded, $dl_path, + my $notsourcesr, $compressed + ) = @_; $base_dir = $dl_path if $dl_path; @@ -195,8 +206,8 @@ sub run { if ($checkdownload) { my $check_file = $dir . '/' . $file; - $check_file =~ s/\.gz$//; - $check_file =~ s/\.Z$//; + + if ( !$compressed ) { $check_file =~ s/\.(gz|Z)$// } print "Checking for file '$check_file'\n"; @@ -206,8 +217,7 @@ sub run { $skipdownload = 1; - $file =~ s/\.gz$//; - $file =~ s/\.Z$//; + if ( !$compressed ) { $file =~ s/\.(gz|Z)$// } } else { print "File '$check_file' does not exist.\n" . 
"Scheduling '$dir/$file' for download...\n"; @@ -265,22 +275,27 @@ sub run { croak("Could not get $type file $file tried 5 times but failed"); } - # if the file is compressed, the FTP server may or may not have automatically uncompressed it - # TODO - read .gz file directly? open (FILE, "zcat $file|") or Compress::Zlib - if ($file =~ /(.*)\.gz$/ or $file =~ /(.*)\.Z$/) { - print "Uncompressing $dir/$file\n"; - system("gunzip -f $dir/$file"); - $file = $1; - } - if ($file =~ /(.*)\.zip$/) { - print "Unzipping $dir/$file\n"; - system("unzip -o -q -d $dir $dir/$file"); - } + # If the file is compressed, the FTP server may or may not have + # automatically uncompressed it (it shouldn't have, is this an + # historical artifact? (ak)). + + if ( !$compressed && ( $file =~ /\.(gz|Z)$/ ) ) { + print "Uncompressing '$dir/$file' using 'gunzip'\n"; + system( "gunzip", "-f", $dir . '/' . $file ); + } + if ( $file =~ /(.*)\.zip$/ ) { + print "Uncompressing '$dir/$file' using 'unzip'\n"; + system( "unzip", "-o", "-q", "-d", $dir, + $dir . '/' . $file ); + } } - $file =~s/\.gz$//; # if skipdownload set this will not have been done yet. - $file=~s/\.Z$//; # if it has no harm done + if ( !$compressed ) { + $file =~ s/\.(gz|Z)$//; # If skipdownload set this will + # not have been done yet. + # If it has, no harm done + } if ($file_from_archive) { push @new_file, $file_from_archive; @@ -361,13 +376,57 @@ sub run { # -------------------------------------------------------------------------------- -sub new { +# Given a file name, returns a IO::Handle object. If the file is +# gzipped, the handle will be to an unseekable stream coming out of a +# zcat pipe. If the given file name doesn't correspond to an existing +# file, the routine will try to add '.gz' to the file name or to remove +# any .'Z' or '.gz' and try again. Returns undef on failure and will +# write a warning to stderr. 
+ +sub get_filehandle +{ + my ($self, $file_name) = @_; + + my $io; + + my $alt_file_name = $file_name; + $alt_file_name =~ s/\.(gz|Z)$//; - my $self = {}; - bless $self, "BaseParser"; + if ( $alt_file_name eq $file_name ) { + $alt_file_name .= '.gz'; + } + + if ( !-f $file_name ) { + carp( "File '$file_name' does not exist, " + . "will try '$alt_file_name'" ); + $file_name = $alt_file_name; + } + + if ( $file_name =~ /\.(gz|Z)$/ ) { + # Read from zcat pipe + $io = IO::File->new("zcat $file_name |") + or carp("Can not open file '$file_name' with 'zcat'"); + } else { + # Read file normally + $io = IO::File->new($file_name) + or carp("Can not open file '$file_name'"); + } + + if ( !defined $io ) { return undef } - return $self; + print "Reading from '$file_name'...\n"; + + return $io; +} + +# -------------------------------------------------------------------------------- + +sub new +{ + my ($proto) = @_; + my $class = ref $proto || $proto; + return bless {}, $class; } # -------------------------------------------------------------------------------- diff --git a/misc-scripts/xref_mapping/XrefParser/CCDSParser.pm b/misc-scripts/xref_mapping/XrefParser/CCDSParser.pm index 877a5995a3..e5fdc637b6 100644 --- a/misc-scripts/xref_mapping/XrefParser/CCDSParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/CCDSParser.pm @@ -4,10 +4,7 @@ use strict; use DBI; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parse file of CCDS records and assign direct xrefs # All assumed to be linked to transcripts @@ -18,17 +15,19 @@ sub run { my ($self, $file, $source_id, $species_id) = @_; - if(!open(CCDS,"<".$file)){ - print "Could not open $file\n"; - return 1; + my $ccds_io = $self->get_filehandle($file); + + if ( !defined $ccds_io ) { + print "Could not open $file\n"; + return 1; } + my $line_count = 0; my $xref_count = 0; my $xref_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? 
AND version=? AND source_id=$source_id AND species_id=$species_id"); - while (<CCDS>) { - + while ( $_ = $ccds_io->getline() ) { my ($stable_id, $ccds) = split; my ($acc, $version) = split (/\./, $ccds); @@ -48,17 +47,8 @@ sub run { print "Parsed $line_count CCDS identifiers from $file, added $xref_count xrefs and $line_count direct_xrefs\n"; - close(CCDS); + $ccds_io->close(); return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::CCDSParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/CeleraParser.pm b/misc-scripts/xref_mapping/XrefParser/CeleraParser.pm index ab51ee815d..0d65429397 100644 --- a/misc-scripts/xref_mapping/XrefParser/CeleraParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/CeleraParser.pm @@ -3,10 +3,7 @@ package XrefParser::CeleraParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Celera database dump for anopheles - FASTA format # @@ -27,12 +24,14 @@ sub run { local $/ = "\n>"; - if(!open(FILE,"<".$file)){ + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { print "Could not open $file\n"; return 1; } - while (<FILE>) { + while ( $_ = $file_io->getline() ) { next if (/^File:/); # skip header my $xref; @@ -69,7 +68,7 @@ sub run { } - close (FILE); + $file_io->close(); print scalar(@xrefs) . 
" Celera xrefs succesfully parsed\n"; @@ -79,13 +78,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::CeleraParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/CeleraProteinParser.pm b/misc-scripts/xref_mapping/XrefParser/CeleraProteinParser.pm index 1325537fb7..a51cc21403 100644 --- a/misc-scripts/xref_mapping/XrefParser/CeleraProteinParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/CeleraProteinParser.pm @@ -2,26 +2,13 @@ package XrefParser::CeleraProteinParser; use strict; -use XrefParser::CeleraParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::CeleraParser); +use base qw( XrefParser::CeleraParser ); # See CeleraParser for details -sub get_sequence_type() { - - return 'peptide'; - -} - - -sub new { - - my $self = {}; - bless $self, "XrefParser::CeleraProteinParser"; - return $self; - +sub get_sequence_type() +{ + return 'peptide'; } 1; diff --git a/misc-scripts/xref_mapping/XrefParser/CeleraTranscriptParser.pm b/misc-scripts/xref_mapping/XrefParser/CeleraTranscriptParser.pm index 99a2e8de6e..fb08e9f140 100644 --- a/misc-scripts/xref_mapping/XrefParser/CeleraTranscriptParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/CeleraTranscriptParser.pm @@ -2,26 +2,13 @@ package XrefParser::CeleraTranscriptParser; use strict; -use XrefParser::CeleraParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::CeleraParser); +use base qw( XrefParser::CeleraParser ); # See CeleraParser for details -sub get_sequence_type() { - - return 'dna'; - -} - - -sub new { - - my $self = {}; - bless $self, "XrefParser::CeleraTranscriptParser"; - return $self; - +sub get_sequence_type() +{ + return 'dna'; } 1; diff --git a/misc-scripts/xref_mapping/XrefParser/CodelinkParser.pm b/misc-scripts/xref_mapping/XrefParser/CodelinkParser.pm index 433ce2e043..061ad01da6 100644 --- a/misc-scripts/xref_mapping/XrefParser/CodelinkParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/CodelinkParser.pm @@ -3,10 +3,7 @@ package 
XrefParser::CodelinkParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parser for Codelink probes @@ -23,13 +20,13 @@ sub run { local $/ = "\n>"; - if(!open(CODELINK,"<".$file)){ + my $codelink_io = $self->get_filehandle($file); + if ( !defined $codelink_io ) { print "ERROR: Could not open $file\n"; - return 1; # 1 = error + return 1; # 1 = error } - while (<CODELINK>) { - + while ( $_ = $codelink_io->getline() ) { my $xref; my ($header, $sequence) = $_ =~ /^>?(.+?)\n([^>]*)/s or warn("Can't parse FASTA entry: $_\n"); @@ -53,6 +50,8 @@ sub run { } + $codelink_io->close(); + print scalar(@xrefs) . " Codelink xrefs succesfully parsed\n"; XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); @@ -61,13 +60,4 @@ sub run { return 0; #successful } - -sub new { - - my $self = {}; - bless $self, "XrefParser::CodelinkParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/DatabaseParser.pm b/misc-scripts/xref_mapping/XrefParser/DatabaseParser.pm index a63494a011..ac72f18c90 100644 --- a/misc-scripts/xref_mapping/XrefParser/DatabaseParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/DatabaseParser.pm @@ -3,10 +3,8 @@ package XrefParser::DatabaseParser; use strict; use DBI; -use XrefParser::BaseParser; -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Base class for parsers that parse from databases rather than files @@ -58,13 +56,5 @@ sub db { } -sub new { - - my $self = {}; - bless $self, "XrefParser::DatabaseParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/EntrezGeneParser.pm b/misc-scripts/xref_mapping/XrefParser/EntrezGeneParser.pm index c17780f382..750e46e1d1 100644 --- a/misc-scripts/xref_mapping/XrefParser/EntrezGeneParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/EntrezGeneParser.pm @@ -4,11 +4,7 @@ use strict; use POSIX 
qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -41,14 +37,15 @@ sub run { my $species_tax_id = $self->get_taxonomy_from_species_id($species_id); - if(!open(EG,"<".$file)){ - print "ERROR: Could not open $file\n"; - return 1; # 1 is an error - } + my $eg_io = $self->get_filehandle($file); + if ( !defined $eg_io ) { + print "ERROR: Could not open $file\n"; + return 1; # 1 is an error + } - my $head = <EG>; # first record are the headers + my $head = $eg_io->getline(); # first record are the headers chomp $head; my (@arr) = split(/\s+/,$head); # process this to the correct indexes to use. (incase they change); @@ -92,7 +89,7 @@ sub run { } my $xref_count = 0; my $syn_count = 0; - while (<EG>) { + while ( $_ = $eg_io->getline() ) { chomp; my (@arr) = split(/\t/,$_); if($arr[$gene_tax_id_index] != $species_tax_id){ @@ -110,18 +107,12 @@ sub run { $syn_count++; } } + + $eg_io->close(); + print $xref_count." EntrezGene Xrefs added with $syn_count synonyms\n"; return 0; #successful } - - -sub new { - - my $self = {}; - bless $self, "XrefParser::EntrezGeneParser"; - return $self; - -} 1; diff --git a/misc-scripts/xref_mapping/XrefParser/FastaParser.pm b/misc-scripts/xref_mapping/XrefParser/FastaParser.pm index 44da5797de..ebde3171f0 100644 --- a/misc-scripts/xref_mapping/XrefParser/FastaParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/FastaParser.pm @@ -4,10 +4,7 @@ use strict; use Bio::SeqIO; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Fasta file format, e.g. 
# >foo peptide sequence for the foo gene @@ -55,13 +52,4 @@ sub run { return 0; #successful } - -sub new { - - my $self = {}; - bless $self, "XrefParser::FastaParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm b/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm index 653589e35a..31db32db96 100644 --- a/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm +++ b/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm @@ -10,14 +10,12 @@ package XrefParser::Flybase_dmel_GFFv3_Parser; use strict; + use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; -use vars qw(@ISA); use Bio::EnsEMBL::Utils::Exception; - -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); my %cache_source =(); @@ -40,9 +38,8 @@ if (!defined(caller())) { # -------------------------------------------------------------------------------- sub new { - my ($class,@args) = @_; - my $self={}; - bless $self,$class; + my $proto = shift; + my $self = $proto->SUPER::new(@_); $self->external_source_db_name('flybase_gff'); @@ -165,11 +162,15 @@ sub create_xrefs { my ($self, $flybase_source_id, $file) = @_; print STDERR "starting to parse $file...." 
; - if(!open(GFF, $file)){ + + my $gff_io = $self->get_filehandle($file); + + if ( !defined $gff_io ) { print "ERROR: Can't open the GFF file $file\n"; return 0; } - while (<GFF>) { + + while ( $_ = $gff_io->getline() ) { chomp; my @col = split /\s+/; if($col[3]){ @@ -202,7 +203,9 @@ sub create_xrefs { } } - close (GFF); + + $gff_io->close(); + return 1; } diff --git a/misc-scripts/xref_mapping/XrefParser/GOParser.pm b/misc-scripts/xref_mapping/XrefParser/GOParser.pm index 00869d3d6f..eac9d917f3 100644 --- a/misc-scripts/xref_mapping/XrefParser/GOParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/GOParser.pm @@ -6,12 +6,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - - +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -63,13 +58,16 @@ sub run { my $count = 0; - if(!open(GO,"<".$file)){ + my $go_io = $self->get_filehandle($file); + + if ( !defined $go_io ) { print "ERROR: Could not open $file\n"; - return 1; # 1 error + return 1; # 1 error } + my $taxon_line = "taxon:".$species_id; my $miss =0; - while (<GO>) { + while ( $_ = $go_io->getline() ) { if(/$taxon_line/){ chomp; my @array = split (/\t/,$_); @@ -168,16 +166,11 @@ sub run { } } } + + $go_io->close(); + print "\t$count GO dependent xrefs added $refseq_miss refseq not found and $swiss_miss Swissprot not found \n"; return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::GOParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/HUGOParser.pm b/misc-scripts/xref_mapping/XrefParser/HUGOParser.pm index 1ff87fa540..c207105f6f 100644 --- a/misc-scripts/xref_mapping/XrefParser/HUGOParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/HUGOParser.pm @@ -3,10 +3,8 @@ package XrefParser::HUGOParser; use strict; use File::Basename; -use 
XrefParser::BaseParser; +use base qw( XrefParser::BaseParser ); -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); my $xref_sth ; my $dep_sth; my $syn_sth; @@ -67,18 +65,20 @@ sub run { my $entrezgene_count = 0; my $mismatch = 0; - if(!open (HUGO, "<$file")){ - print "ERROR: Can't open HUGO file $file\n"; + my $hugo_io = $self->get_filehandle($file); + + if ( !defined $hugo_io ) { + print "ERROR: Can't open HUGO file $file\n"; return 1; } - <HUGO>; + $_ = $hugo_io->getline(); #23 ABAT 4-aminobutyrate aminotransferase P80404 #29 ABCA1 ATP-binding cassette, sub-family A (ABC1), member 1 ABC1, HDLDT1 O95477 #40 ABCB1 ATP-binding cassette, sub-family B (MDR/TAP), member 1 PGY1, MDR1, CLCS P-gp, CD243, GP170, ABC20 P08183 NM_000927 - while (<HUGO>) { + while ( $_ = $hugo_io->getline() ) { chomp; @@ -168,7 +168,7 @@ sub run { } # while HUGO - close (HUGO); + $hugo_io->getline(); print "Loaded a total of " . ($swiss_count + $refseq_count + $entrezgene_count) . " HUGO xrefs, $refseq_count from RefSeq curated mappings and $swiss_count from Uniprot (mapped) and $entrezgene_count from EntrezGene mappings\n"; @@ -182,15 +182,6 @@ sub rename_url_file{ return "hugo.txt"; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::HUGOParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/HUGO_CCDSParser.pm b/misc-scripts/xref_mapping/XrefParser/HUGO_CCDSParser.pm index 394cf49af5..ca172bf50f 100644 --- a/misc-scripts/xref_mapping/XrefParser/HUGO_CCDSParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/HUGO_CCDSParser.pm @@ -4,10 +4,7 @@ use strict; use DBI; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parse file of HGNC records and assign direct xrefs # All assumed to be linked to genes @@ -16,7 +13,9 @@ sub run { my ($self, $file, $source_id, $species_id) = @_; - if(!open(HUGO,"<".$file)){ + my $hugo_io = $self->get_filehandle($file); + + if ( !defined 
$hugo_io ) { print "Could not open $file\n"; return 1; } @@ -68,7 +67,8 @@ sub run { my %seen; my $ignore_count = 0; my $ignore_examples =""; - while(<HUGO>){ + + while( $_ = $hugo_io->getline() ) { chomp; my ($ccds,$hgnc) = split; @@ -96,17 +96,9 @@ sub run { print $ignore_count." ignoreed due to numbers no identifiers being no longer valid :- $ignore_examples\n"; } - close(HUGO); - return 0; + $hugo_io->close(); -} - - -sub new { - - my $self = {}; - bless $self, "XrefParser::HUGO_CCDSParser"; - return $self; + return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/HUGO_ENSGParser.pm b/misc-scripts/xref_mapping/XrefParser/HUGO_ENSGParser.pm index af972d8015..664904ff7b 100644 --- a/misc-scripts/xref_mapping/XrefParser/HUGO_ENSGParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/HUGO_ENSGParser.pm @@ -4,10 +4,7 @@ use strict; use DBI; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parse file of HGNC records and assign direct xrefs # All assumed to be linked to genes @@ -16,15 +13,16 @@ sub run { my ($self, $file, $source_id, $species_id) = @_; - if(!open(HUGO,"<".$file)){ + my $hugo_io = $self->get_filehandle($file); + + if ( !defined $hugo_io ) { print "Could not open $file\n"; return 1; } + my $line_count = 0; my $xref_count = 0; - - # becouse the direct mapping have no descriptions etc # we have to steal these fromt he previous HUGO parser. # This is why the order states this is after the other one. @@ -51,7 +49,8 @@ sub run { my $ignore_count = 0; my $ignore_examples =""; my %acc; - while (<HUGO>) { + + while ( $_ = $hugo_io->getline() ) { my ($hgnc, $stable_id) = split; @@ -79,17 +78,10 @@ sub run { if($ignore_count){ print $ignore_count." 
ignoreed due to numbers no identifiers being no longer valid :- $ignore_examples\n"; } - close(HUGO); - return 0; -} - -sub new { - - my $self = {}; - bless $self, "XrefParser::HUGO_ENSGParser"; - return $self; + $hugo_io->close(); + return 0; } 1; diff --git a/misc-scripts/xref_mapping/XrefParser/IPIParser.pm b/misc-scripts/xref_mapping/XrefParser/IPIParser.pm index e37d31b080..0f62360f24 100644 --- a/misc-scripts/xref_mapping/XrefParser/IPIParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/IPIParser.pm @@ -3,10 +3,7 @@ package XrefParser::IPIParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # IPI file format: fasta, e.g. # >IPI:IPI00000005.1|SWISS-PROT:P01111|TREMBL:Q15104|REFSEQ_NP:NP_002515|ENSEMBL:ENSP00000261444 Tax_Id=9606 Transforming protein N-Ras @@ -22,14 +19,16 @@ sub run { local $/ = "\n>"; - if(!open(IPI,"<".$file)){ + my $ipi_io = $self->get_filehandle($file); + + if ( !defined $ipi_io ) { print "ERROR: Could not open $file\n"; - return 1; # 1 = error + return 1; # 1 = error } - my $species_tax_id = $self->get_taxonomy_from_species_id($species_id); - while (<IPI>) { + my $species_tax_id = $self->get_taxonomy_from_species_id($species_id); + while ( $_ = $ipi_io->getline() ) { my $xref; my ($header, $sequence) = $_ =~ /^>?(.+?)\n([^>]*)/s or warn("Can't parse FASTA entry: $_\n"); @@ -64,6 +63,8 @@ sub run { } + $ipi_io->close(); + print scalar(@xrefs) . 
" IPI xrefs succesfully parsed\n"; XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); @@ -72,13 +73,4 @@ sub run { return 0; #successful } - -sub new { - - my $self = {}; - bless $self, "XrefParser::IPIParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/IlluminaParser.pm b/misc-scripts/xref_mapping/XrefParser/IlluminaParser.pm index 10be9f5d07..d959b579b0 100644 --- a/misc-scripts/xref_mapping/XrefParser/IlluminaParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/IlluminaParser.pm @@ -2,10 +2,7 @@ package XrefParser::IlluminaParser; use strict; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parser for Illumina V2 xrefs - V1 are done by the vanilla FastaParser @@ -21,13 +18,14 @@ sub run { my @xrefs; - if(!open(FILE,"<".$file)){ + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { print "Could not open $file\n"; return 1; } - while (<FILE>) { - + while ( $_ = $file_io->getline() ) { chomp; my $xref; @@ -63,7 +61,7 @@ sub run { } - close(FILE); + $file_io->close(); print scalar(@xrefs) . 
" Illumina V2 xrefs succesfully parsed\n"; @@ -73,13 +71,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::IlluminaParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/InterproParser.pm b/misc-scripts/xref_mapping/XrefParser/InterproParser.pm index 31e7706bf6..defaf5fd35 100644 --- a/misc-scripts/xref_mapping/XrefParser/InterproParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/InterproParser.pm @@ -4,11 +4,8 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; +use base qw( XrefParser::BaseParser ); -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - my $xref_sth ; my $dep_sth; @@ -57,15 +54,18 @@ sub run { my $dir = dirname($file); - + my %short_name; my %description; my %pfam; - - if(!open (XML, $dir."/interpro.xml")){ + + my $xml_io = $self->get_filehandle( $dir . "/interpro.xml" ); + + if ( !defined $xml_io ) { print "ERROR: Can't open hugo interpro file $dir/interpro.xml\n"; - return 1; # 1= error + return 1; # 1= error } + #<interpro id="IPR001023" type="Family" short_name="Hsp70" protein_count="1556"> # <name>Heat shock protein Hsp70</name> # <db_xref protein_count="18" db="PFAM" dbkey="PF01278" name="Omptin" /> @@ -77,8 +77,8 @@ sub run { my $last = ""; my $i =0; - while (<XML>) { + while ( $_ = $xml_io->getline() ) { my $interpro; my $short_name; my $name; @@ -109,7 +109,8 @@ sub run { } } - close (LONG); + $xml_io->close(); + for my $db ( keys %count ) { print "\t".$count{$db}." 
$db loaded.\n"; } @@ -126,14 +127,4 @@ sub get_xref{ return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::InterproParser"; - return $self; - -} - 1; - - diff --git a/misc-scripts/xref_mapping/XrefParser/JGI_Parser.pm b/misc-scripts/xref_mapping/XrefParser/JGI_Parser.pm index 58377e3011..4cb15cd0a2 100644 --- a/misc-scripts/xref_mapping/XrefParser/JGI_Parser.pm +++ b/misc-scripts/xref_mapping/XrefParser/JGI_Parser.pm @@ -3,10 +3,7 @@ package XrefParser::JGI_Parser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # JGI protein file with gene predictons - FASTA FORMAT # @@ -50,11 +47,14 @@ sub run { local $/ = "\n>"; - if(!open(FILE,"<".$file)){ - print "ERROR: Could not open $file\n"; - return 1; # 1 is an error + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { + print "ERROR: Could not open $file\n"; + return 1; # 1 is an error } - while (<FILE>) { + + while ( $_ = $file_io->getline() ) { next if (/^File:/); # skip header @@ -122,7 +122,7 @@ sub run { } - close (FILE); + $file_io->close(); print scalar(@xrefs) . 
" JGI_ xrefs succesfully parsed\n"; @@ -133,11 +133,14 @@ sub run { } -sub new { - my $self = {}; - bless $self, "XrefParser::JGI_Parser"; - print "\n\nh ave new jp\n" ; - return $self; +sub new +{ + my $proto = shift; + my $self = $proto->SUPER::new(@_); + + print "\n\nhave new jp\n"; + + return $self; } 1; diff --git a/misc-scripts/xref_mapping/XrefParser/JGI_ProteinParser.pm b/misc-scripts/xref_mapping/XrefParser/JGI_ProteinParser.pm index 218e8fe9dc..90afb658b7 100644 --- a/misc-scripts/xref_mapping/XrefParser/JGI_ProteinParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/JGI_ProteinParser.pm @@ -3,21 +3,12 @@ package XrefParser::JGI_ProteinParser; use strict; -use XrefParser::JGI_Parser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::JGI_Parser); +use base qw( XrefParser::JGI_Parser ); # See JGI_Parser for details + sub get_sequence_type() { return 'peptide'; } - -sub new { - my $self = {}; - bless $self, "XrefParser::JGI_ProteinParser"; - return $self; -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/MGDParser.pm b/misc-scripts/xref_mapping/XrefParser/MGDParser.pm index c5bb48cb42..85b9aa5dc9 100644 --- a/misc-scripts/xref_mapping/XrefParser/MGDParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/MGDParser.pm @@ -4,11 +4,8 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; +use base qw( XrefParser::BaseParser ); -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - my $xref_sth ; my $dep_sth; @@ -51,11 +48,14 @@ sub run { my $mismatch = 0; my %mgi_good; - if(!open(FILE,"<". 
$file)){ - print "ERROR: Could not open file $file"; - return 1; # 1 is an error + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { + print "ERROR: Could not open file $file"; + return 1; # 1 is an error } - while(my $line = <FILE>){ + + while ( my $line = $file_io->getline() ) { chomp $line; my ($key,$label,$desc,$sps) = (split("\t",$line))[0,1,3,6]; my @sp = split(/\s/,$sps); @@ -70,19 +70,21 @@ sub run { } } } - close FILE; - + $file_io->close(); my $dir = dirname($file); my $syn_file = $dir."/MRK_Synonym.sql.rpt"; - if(!open(FILE2,"<". $syn_file)){ - print "ERROR: Could not open file $syn_file"; + $file_io = $self->get_filehandle($syn_file); + + if ( !defined $file_io ) { + print "ERROR: Could not open file $syn_file"; return 1; } + my $synonyms=0; - while(<FILE2>){ + while ( $_ = $file_io->getline() ) { if(/MGI:/){ chomp ; my ($key,$syn) = (split)[0,4]; @@ -92,23 +94,15 @@ sub run { } } } - close FILE2; + + $file_io->close(); + print "\t$count xrefs succesfully loaded\n"; print "\t$synonyms synonyms successfully loaded\n"; print "\t$mismatch xrefs failed to load\n"; return 0; - - -} - -sub new { - - my $self = {}; - bless $self, "XrefParser::MGDParser"; - return $self; - } - + 1; diff --git a/misc-scripts/xref_mapping/XrefParser/MIMParser.pm b/misc-scripts/xref_mapping/XrefParser/MIMParser.pm index a21828179f..c634c09925 100644 --- a/misc-scripts/xref_mapping/XrefParser/MIMParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/MIMParser.pm @@ -4,11 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -52,16 +48,21 @@ sub run { local $/ = "*RECORD*"; - if(!open(MIM,"<".$file)){ - print "ERROR: Could not open $file\n"; - return 1; # 1 is an error + my $mim_io = 
$self->get_filehandle($file); + + if ( !defined $mim_io ) { + print "ERROR: Could not open $file\n"; + return 1; # 1 is an error } - + my $gene = 0; my $phenotype = 0; my $removed_count =0; - <MIM>; # first record is empty with *RECORD* as the record seperator - while (<MIM>) { + + $mim_io->getline(); # first record is empty with *RECORD* as the + # record separator + + while ( $_ = $mim_io->getline() ) { #get the MIM number my $number = 0; my $description = undef; @@ -101,6 +102,9 @@ sub run { } } } + + $mim_io->close(); + my $syn_count =0; foreach my $mim (keys %old_to_new){ my $old= $mim; @@ -118,14 +122,4 @@ sub run { return 0; #successful } - - -sub new { - - my $self = {}; - bless $self, "XrefParser::MIMParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/OTTTParser.pm b/misc-scripts/xref_mapping/XrefParser/OTTTParser.pm index 7d188c3a6a..a56b154227 100644 --- a/misc-scripts/xref_mapping/XrefParser/OTTTParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/OTTTParser.pm @@ -4,10 +4,7 @@ use strict; use DBI; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parse file of Ensembl - Vega OTTT transcript mappings # ENST00000373795: OTTHUMT00000010392 @@ -19,17 +16,19 @@ sub run { my ($self, $file, $source_id, $species_id) = @_; - if(!open(OTTT,"<".$file)){ + my $ottt_io = $self->get_filehandle($file); + + if ( !defined $ottt_io ) { print "Could not open $file\n"; return 1; } + my $line_count = 0; my $xref_count = 0; my $xref_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? 
AND source_id=$source_id AND species_id=$species_id"); - while (<OTTT>) { - + while ( $_ = $ottt_io->getline() ) { my ($ens, $ottt) = split; $ens =~ s/://g; @@ -48,19 +47,12 @@ sub run { } - print "Parsed $line_count OTTT identifiers from $file, added $xref_count xrefs and $line_count direct_xrefs\n"; + $ottt_io->close(); - close(OTTT); - return 0; -} - - -sub new { + print "Parsed $line_count OTTT identifiers from $file, added $xref_count xrefs and $line_count direct_xrefs\n"; - my $self = {}; - bless $self, "XrefParser::OTTTParser"; - return $self; + return 0; } 1; diff --git a/misc-scripts/xref_mapping/XrefParser/RGDParser.pm b/misc-scripts/xref_mapping/XrefParser/RGDParser.pm index 6eb169a375..8f147c02a8 100644 --- a/misc-scripts/xref_mapping/XrefParser/RGDParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RGDParser.pm @@ -4,10 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); my $xref_sth ; my $dep_sth; @@ -47,11 +44,14 @@ sub run { my (%refseq) = %{XrefParser::BaseParser->get_valid_codes("refseq",$species_id)}; - if(!open(RGD,"<".$file)){ + my $rgd_io = $self->get_filehandle($file); + + if ( !defined $rgd_io ) { print "ERROR: Could not open $file\n"; return 1; } - my $line = <RGD>; + + my $line = $rgd_io->getline(); chomp $line; my @linearr = split(/\t/,$line); @@ -74,7 +74,7 @@ sub run { my $count= 0; my $mismatch = 0; - while ($line = <RGD>) { + while ( $line = $rgd_io->getline() ) { chomp $line; my ($rgd, $symbol, $name, $refseq) = (split (/\t/,$line))[0,1,2,16]; my @nucs = split(/\,/,$refseq); @@ -94,6 +94,7 @@ sub run { } } } + if(!$done){ # print STDERR "$rgd FAILED for $failed_list\n"; $self->add_xref("RGD:".$rgd,"",$symbol,$name,$source_id,$species_id); @@ -101,18 +102,12 @@ sub run { } } + + $rgd_io->close(); + print "\t$count xrefs succesfully loaded and dependent on refseq\n"; print "\t$mismatch xrefs added 
but with NO dependencies\n"; return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::RGDParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm index 9a1392014f..efad047380 100644 --- a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm @@ -6,10 +6,7 @@ use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw( XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -49,8 +46,10 @@ sub run { my $pred_dna_source_id = XrefParser::BaseParser->get_source_id_for_source_name('RefSeq_dna_predicted'); print "RefSeq_peptide_predicted source ID = $pred_peptide_source_id; RefSeq_dna_predicted source ID = $pred_dna_source_id\n"; + my $xrefs = + $self->create_xrefs( $peptide_source_id, $dna_source_id, + $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id ); - my $xrefs = create_xrefs($peptide_source_id, $dna_source_id, $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id); if(!defined($xrefs)){ return 1; #error } @@ -68,8 +67,10 @@ sub run { # Slightly different formats sub create_xrefs { + my $self = shift; - my ($peptide_source_id, $dna_source_id, $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id) = @_; + my ( $peptide_source_id, $dna_source_id, $pred_peptide_source_id, + $pred_dna_source_id, $file, $species_id ) = @_; my %name2species_id = XrefParser::BaseParser->name2species_id(); @@ -78,10 +79,13 @@ sub create_xrefs { # my (%genemap) = %{XrefParser::BaseParser->get_valid_codes("mim_gene",$species_id)}; # my (%morbidmap) = %{XrefParser::BaseParser->get_valid_codes("mim_morbid",$species_id)}; - if(!open(REFSEQ, $file)){ + my $refseq_io = $self->get_filehandle($file); 
+ + if ( !defined $refseq_io ) { print "ERROR: Can't open RefSeqGPFF file $file\n"; return undef; } + my @xrefs; local $/ = "\/\/\n"; @@ -109,7 +113,7 @@ sub create_xrefs { } - while (<REFSEQ>) { + while ( $_ = $refseq_io->getline() ) { my $xref; @@ -224,7 +228,7 @@ sub create_xrefs { } # while <REFSEQ> - close (REFSEQ); + $refseq_io->close(); print "Read " . scalar(@xrefs) ." xrefs from $file\n"; @@ -234,14 +238,4 @@ sub create_xrefs { # -------------------------------------------------------------------------------- -sub new { - - my $self = {}; - bless $self, "XrefParser::RefSeqGPFFParser"; - return $self; - -} - -# -------------------------------------------------------------------------------- - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm index 1557f53fc5..1625701a10 100644 --- a/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm @@ -6,10 +6,7 @@ use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -46,7 +43,10 @@ sub run { $species_id = XrefParser::BaseParser->get_species_id_for_filename($file); } - my $xrefs = create_xrefs($peptide_source_id, $dna_source_id, $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id); + my $xrefs = + $self->create_xrefs( $peptide_source_id, $dna_source_id, + $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id ); + if(!defined($xrefs)){ return 1; #error } @@ -65,21 +65,25 @@ sub run { # Slightly different formats sub create_xrefs { + my $self = shift; - my ($peptide_source_id, $dna_source_id, $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id) = @_; + my ( $peptide_source_id, $dna_source_id, $pred_peptide_source_id, 
+ $pred_dna_source_id, $file, $species_id ) = @_; my %name2species_id = XrefParser::BaseParser->name2species_id(); - if(!open(REFSEQ, $file)){ - print "ERROR: Can't open RefSeq file $file\n"; - return undef; + my $refseq_io = $self->get_filehandle($file); + + if ( !defined $refseq_io ) { + print "ERROR: Can't open RefSeq file $file\n"; + return undef; } + my @xrefs; local $/ = "\n>"; - while (<REFSEQ>) { - + while ( $_ = $refseq_io->getline() ) { my $xref; my $entry = $_; @@ -92,7 +96,7 @@ sub create_xrefs { (my $gi, my $n, my $ref, my $acc, my $description) = split(/\|/, $header); my ($species, $mrna); - if ($file =~ /\.faa$/) { + if ($file =~ /\.faa(\.gz|\.Z)?$/) { ($mrna, $description, $species) = $description =~ /(\S*)\s+(.*)\s+\[(.*)\]$/; $xref->{SEQUENCE_TYPE} = 'peptide'; @@ -105,7 +109,7 @@ sub create_xrefs { } $xref->{SOURCE_ID} = $source_id; - } elsif ($file =~ /\.fna$/) { + } elsif ($file =~ /\.fna(\.gz|\.Z)?$/) { ($species, $description) = $description =~ /\s*(\w+\s+\w+)\s+(.*)$/; $xref->{SEQUENCE_TYPE} = 'dna'; @@ -126,8 +130,10 @@ sub create_xrefs { my $species_id_check = $name2species_id{$species}; # skip xrefs for species that aren't in the species table - if (defined($species_id) and $species_id == $species_id_check) { - + if ( defined $species_id + && defined $species_id_check + && $species_id == $species_id_check ) + { my ($acc_no_ver,$ver) = split (/\./,$acc); $xref->{ACCESSION} = $acc_no_ver; $xref->{VERSION} = $ver; @@ -144,7 +150,7 @@ sub create_xrefs { } - close (REFSEQ); + $refseq_io->close(); print "Read " . scalar(@xrefs) ." 
xrefs from $file\n"; @@ -154,14 +160,4 @@ sub create_xrefs { # -------------------------------------------------------------------------------- -sub new { - - my $self = {}; - bless $self, "XrefParser::RefSeqParser"; - return $self; - -} - -# -------------------------------------------------------------------------------- - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeq_CCDSParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeq_CCDSParser.pm index e1dcec0067..96a376aae4 100644 --- a/misc-scripts/xref_mapping/XrefParser/RefSeq_CCDSParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RefSeq_CCDSParser.pm @@ -4,10 +4,7 @@ use strict; use DBI; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Parse file of Refseq records and assign direct xrefs @@ -15,13 +12,13 @@ sub run { my ($self, $file, $source_id, $species_id) = @_; - if(!open(REFSEQ,"<".$file)){ + my $refseq_io = $self->get_filehandle($file); + + if ( !defined $refseq_io ) { print "Could not open $file\n"; return 1; } - - # becouse the direct mapping have no descriptions etc # we have to steal these from the previous Refseq parser. 
@@ -67,8 +64,10 @@ sub run { my $xref_count = 0; my %seen; my %old_to_new; - <REFSEQ>; # header - while(<REFSEQ>){ + + $refseq_io->getline(); # header + + while ( $_ = $refseq_io->getline() ) { chomp; my ($ccds,$refseq) = split; @@ -102,19 +101,12 @@ sub run { } } - print "Parsed $line_count RefSeq_dna identifiers from $file, added $xref_count xrefs and $xref_count direct_xrefs from $line_count lines.\n"; - - close(REFSEQ); - return 0; - -} + $refseq_io->close(); + print "Parsed $line_count RefSeq_dna identifiers from $file, added $xref_count xrefs and $xref_count direct_xrefs from $line_count lines.\n"; -sub new { - my $self = {}; - bless $self, "XrefParser::RefSeq_CCDSParser"; - return $self; + return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/SGDParser.pm b/misc-scripts/xref_mapping/XrefParser/SGDParser.pm index d6beb15afd..8d1f89f604 100644 --- a/misc-scripts/xref_mapping/XrefParser/SGDParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/SGDParser.pm @@ -4,11 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -40,15 +36,17 @@ sub run { - if(!open(SGD,"<".$file)){ - print "ERROR: Could not open $file\n"; - return 1; # 1 is an error + my $sgd_io = $self->get_filehandle($file); + + if ( !defined $sgd_io ) { + print "ERROR: Could not open $file\n"; + return 1; # 1 is an error } my $xref_count =0; my $syn_count =0; - while (<SGD>) { + while ( $_ = $sgd_io->getline() ) { chomp; my ($locus_name, $alias_name, $desc, $gene_prod, $phenotype, $orf_name, $sgd_id) = split(/\t/,$_); @@ -60,18 +58,11 @@ sub run { $syn_count++; } } + + $sgd_io->close(); + print $xref_count." 
SGD Xrefs added with $syn_count synonyms\n"; return 0; #successful } - - -sub new { - - my $self = {}; - bless $self, "XrefParser::SGDParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/SegmentParser.pm b/misc-scripts/xref_mapping/XrefParser/SegmentParser.pm index 8d5605cb3b..8ff59a44db 100644 --- a/misc-scripts/xref_mapping/XrefParser/SegmentParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/SegmentParser.pm @@ -4,10 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -41,11 +38,14 @@ sub run { my %name_2_source_id=(); my $added=0; - if(!open(FILE,"<". $file)){ - print "ERROR: Could not open file $file\n"; + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { + print "ERROR: Could not open file $file\n"; return 1; } - while(my $line = <FILE>){ + + while ( my $line = $file_io->getline() ) { chomp $line; my ($gene_id,$transcript_id,$source_name,$acc,$display_label,$description, $status) = split("\t",$line); @@ -68,19 +68,11 @@ sub run { #the those mapped to the transcript to the genes anyway due to the #biomart check } - close FILE; + + $file_io->close(); print "Added $added Xrefs for Gene segments\n"; return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::SegmentParser"; - return $self; - -} - - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/UniGeneParser.pm b/misc-scripts/xref_mapping/XrefParser/UniGeneParser.pm index 1d6f7e1faf..fd0680d935 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniGeneParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniGeneParser.pm @@ -6,10 +6,7 @@ use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( 
XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -41,7 +38,11 @@ sub run { if(!defined($species_id)){ $species_id = XrefParser::BaseParser->get_species_id_for_filename($file); } -my $xrefs =create_xrefs($unigene_source_id, $unigene_source_id, $file, $species_id); + + my $xrefs = + $self->create_xrefs( $unigene_source_id, $unigene_source_id, $file, + $species_id ); + if(!defined($xrefs)){ return 1; #error } @@ -56,23 +57,24 @@ my $xrefs =create_xrefs($unigene_source_id, $unigene_source_id, $file, $species_ my %geneid_2_desc; sub get_desc{ + my $self = shift; my $file = shift; + my $dir = dirname($file); - (my $name) = $file =~ /\/(\w+)\.seq\.uniq/; print $name."\n"; local $/ = "//"; + my $desc_io = $self->get_filehandle( $dir . '/' . $name . '.data' ); - if(!open (DESC, "$dir/$name.data")){ + if ( !defined $desc_io ) { print "ERROR: Can't open $dir/$name.data\n"; return undef; } - while(<DESC>){ - + while ( $_ = $desc_io->getline() ) { #ID Hs.159356 #TITLE Hypothetical LOC388277 @@ -82,21 +84,27 @@ sub get_desc{ $geneid_2_desc{$id} = $descrip; } + + $desc_io->close(); + return 1; } sub create_xrefs { + my $self = shift; - my ($peptide_source_id, $unigene_source_id, $file, $species_id) = @_; + my ( $peptide_source_id, $unigene_source_id, $file, $species_id ) = @_; my %name2species_id = XrefParser::BaseParser->name2species_id(); - if(!defined(get_desc($file))){ + if ( !defined( $self->get_desc($file) ) ) { return undef; } - if(!open(UNIGENE, $file)){ + my $unigene_io = $self->get_filehandle($file); + + if ( !defined $unigene_io ) { print "Can't open RefSeq file $file\n"; return undef; } @@ -110,7 +118,7 @@ sub create_xrefs { local $/ = "\n>"; - while (<UNIGENE>) { + while ( $_ = $unigene_io->getline() ) { my $xref; @@ -150,7 +158,8 @@ sub create_xrefs { } - close (UNIGENE); + $unigene_io->close(); + %geneid_2_desc=(); print "Read " . scalar(@xrefs) ." 
xrefs from $file\n"; @@ -158,16 +167,4 @@ sub create_xrefs { } -# -------------------------------------------------------------------------------- - -sub new { - - my $self = {}; - bless $self, "XrefParser::UniGeneParser"; - return $self; - -} - -# -------------------------------------------------------------------------------- - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm b/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm index 97c5dcdf03..e15ad5a66a 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm @@ -13,10 +13,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -54,7 +51,10 @@ sub run { print "SpTREMBL source id for $file: $sptr_source_id\n"; - my @xrefs = create_xrefs($sp_source_id, $sptr_source_id, $species_id, $file); + my @xrefs = + $self->create_xrefs( $sp_source_id, $sptr_source_id, $species_id, + $file ); + if ( !@xrefs ) { return 1; # 1 error } @@ -110,8 +110,9 @@ sub get_species { # Parse file into array of xref objects sub create_xrefs { + my $self = shift; - my ($sp_source_id, $sptr_source_id, $species_id, $file) = @_; + my ( $sp_source_id, $sptr_source_id, $species_id, $file ) = @_; my $num_sp = 0; my $num_sptr = 0; @@ -135,15 +136,14 @@ sub create_xrefs { my (%genemap) = %{XrefParser::BaseParser->get_valid_codes("mim_gene",$species_id)}; my (%morbidmap) = %{XrefParser::BaseParser->get_valid_codes("mim_morbid",$species_id)}; - if(!open(UNIPROT, $file)){ - print"Can't open Swissprot file $file\n"; - return undef; - } + my $uniprot_io = $self->get_filehandle($file); + if ( !defined $uniprot_io ) { return undef } + my @xrefs; - local $/ = "\/\/\n"; + local $/ = "//\n"; - while 
(<UNIPROT>) { + while ( $_ = $uniprot_io->getline() ) { # if an OX line exists, only store the xref if the taxonomy ID that the OX # line refers to is in the species table @@ -353,7 +353,7 @@ sub create_xrefs { } - close (UNIPROT); + $uniprot_io->close(); print "Read $num_sp SwissProt xrefs and $num_sptr SPTrEMBL xrefs from $file\n"; print "Found $num_sp_pred predicted SwissProt xrefs and $num_sptr_pred predicted SPTrEMBL xrefs\n" if ($num_sp_pred > 0 || $num_sptr_pred > 0); @@ -363,16 +363,4 @@ sub create_xrefs { #TODO - currently include records from other species - filter on OX line?? } -# -------------------------------------------------------------------------------- - -sub new { - - my $self = {}; - bless $self, "XrefParser::UniProtParser"; - return $self; - -} - -# -------------------------------------------------------------------------------- - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/UniProtVarSplicParser.pm b/misc-scripts/xref_mapping/XrefParser/UniProtVarSplicParser.pm index 0486f04a67..9001395c49 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniProtVarSplicParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniProtVarSplicParser.pm @@ -5,10 +5,7 @@ package XrefParser::UniProtVarSplicParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # UniProtVarSplic file format: fasta, e.g. 
@@ -27,17 +24,18 @@ sub run { local $/ = "\n>"; - if(!open(FILE,"<".$file)){ - print "ERROR: Could not open $file\n"; - return 1; # 1 error + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { + print "ERROR: Could not open $file\n"; + return 1; # 1 error } my $species_tax_id = $self->get_taxonomy_from_species_id($species_id); my (%swiss) = %{XrefParser::BaseParser->get_valid_codes("uniprot",$species_id)}; my $missed = 0; - while (<FILE>) { - + while ( $_ = $file_io->getline() ) { my $xref; my ($header, $sequence) = $_ =~ /^>?(.+?)\n([^>]*)/s or warn("Can't parse FASTA entry: $_\n"); @@ -69,7 +67,7 @@ sub run { } } - close (FILE); + $file_io->close(); print $missed." ignored as original uniprot not found in database\n"; print scalar(@xrefs) . " UniProtVarSplic xrefs succesfully parsed\n"; @@ -80,15 +78,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::UniProtVarSplicParser"; - return $self; - -} - 1; - - diff --git a/misc-scripts/xref_mapping/XrefParser/WilsonAffyParser.pm b/misc-scripts/xref_mapping/XrefParser/WilsonAffyParser.pm index fcd0eda3da..df059b421c 100644 --- a/misc-scripts/xref_mapping/XrefParser/WilsonAffyParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/WilsonAffyParser.pm @@ -2,16 +2,12 @@ package XrefParser::WilsonAffyParser; use strict; -use XrefParser::BaseParser; +use base qw( XrefParser::BaseParser ); -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); my $xref_sth ; my $dep_sth; my $syn_sth; - - sub run { my ($self, $file, $source_id, $species_id) = @_; @@ -39,14 +35,16 @@ sub create_xrefs { my @xrefs; - if(!open(FILE,"<".$file)){ + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { print "ERROR: Could not open $file\n"; - return 1; # 1 error + return 1; # 1 error } - <FILE>; # skip first line - while (<FILE>) { + $file_io->getline(); # skip first line + while ( $_ = $file_io->getline() ) { #last if ($count > 200); my $xref; @@ -79,10 +77,13 @@ sub 
create_xrefs { # fetch sequence for others (EMBL ESTs and RefSeqs - pfetch will handle these) system ("pfetch -q $target > seq.txt"); - open(SEQ, "<seq.txt"); - my $seq = <SEQ>; + + my $seq_io = $self->get_filehandle('seq.txt'); + + my $seq = $seq_io->getline(); + $seq_io->close(); + chomp($seq); - close(SEQ); if ($seq && $seq !~ /no match/) { @@ -116,7 +117,7 @@ sub create_xrefs { } - close(FILE); + $file_io->close(); print "\n\nParsed $count primary xrefs.\n"; print "Couldn't get sequence for $noseq primary_xrefs\n" if ($noseq); @@ -126,12 +127,4 @@ sub create_xrefs { } -sub new { - - my $self = {}; - bless $self, "XrefParser::WilsonAffyParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/WormPepParser.pm b/misc-scripts/xref_mapping/XrefParser/WormPepParser.pm index c31253be7d..8f27dbfb86 100644 --- a/misc-scripts/xref_mapping/XrefParser/WormPepParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/WormPepParser.pm @@ -3,10 +3,8 @@ package XrefParser::WormPepParser; use strict; use File::Basename; -use XrefParser::BaseParser; +use base qw( XrefParser::BaseParser ); -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); my $xref_sth ; my $dep_sth; @@ -31,15 +29,17 @@ sub run { my $xref_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$worm_source_id AND species_id=$species_id"); my $xref_sth2 = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? 
AND source_id=$worm_locus_id AND species_id=$species_id"); - if(!open(PEP,"<".$file)){ + my $pep_io = $self->get_filehandle($file); + + if ( !defined $pep_io ) { print "ERROR: Could not open $file\n"; - return 1; # 1 error + return 1; # 1 error } - my ($x_count, $d_count); + my ($x_count, $d_count); - while (<PEP>) { + while ( $_ = $pep_io->getline() ) { my ($transcript, $wb, $display) = (split(/\t/,substr($_,1)))[0,1,2]; # reuse or create xref @@ -69,19 +69,10 @@ sub run { $d_count++; } - close (PEP); + $pep_io->close(); print "Added $d_count direct xrefs and $x_count xrefs\n"; return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::WormPepParser"; - return $self; - -} - 1; - diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseDatabaseStableIDParser.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseDatabaseStableIDParser.pm index c18fa62366..92d43cabed 100644 --- a/misc-scripts/xref_mapping/XrefParser/WormbaseDatabaseStableIDParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/WormbaseDatabaseStableIDParser.pm @@ -7,10 +7,7 @@ package XrefParser::WormbaseDatabaseStableIDParser; use strict; -use XrefParser::DatabaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::DatabaseParser); +use base qw( XrefParser::DatabaseParser ); sub run { @@ -46,13 +43,5 @@ sub run { return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::WormbaseDatabaseStableIDParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/XenopusJamboreeParser.pm b/misc-scripts/xref_mapping/XrefParser/XenopusJamboreeParser.pm index 452855aea8..02c1e7591f 100644 --- a/misc-scripts/xref_mapping/XrefParser/XenopusJamboreeParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/XenopusJamboreeParser.pm @@ -5,10 +5,7 @@ package XrefParser::XenopusJamboreeParser; use strict; use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # Xenopus Jamboree peptides file 
format: fasta, e.g. @@ -24,14 +21,16 @@ sub run { local $/ = "\n>"; - if(!open(FILE,"<".$file)){ + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { print "ERROR: Could not open $file\n"; - return 1; # 1 error + return 1; # 1 error } - my $species_tax_id = $self->get_taxonomy_from_species_id($species_id); - while (<FILE>) { + my $species_tax_id = $self->get_taxonomy_from_species_id($species_id); + while ( $_ = $file_io->getline() ) { my $xref; my ($header, $sequence) = $_ =~ /^>?(.+?)\n([^>]*)/s or warn("Can't parse FASTA entry: $_\n"); @@ -57,7 +56,7 @@ sub run { } - close (FILE); + $file_io->close(); print scalar(@xrefs) . " XenopusJamboreeParser xrefs succesfully parsed\n"; @@ -69,13 +68,4 @@ sub run { return 0; } - -sub new { - - my $self = {}; - bless $self, "XrefParser::XenopusJamboreeParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/ZFINParser.pm b/misc-scripts/xref_mapping/XrefParser/ZFINParser.pm index 4875998fe4..83ceda1bcf 100644 --- a/misc-scripts/xref_mapping/XrefParser/ZFINParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/ZFINParser.pm @@ -4,12 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); - - +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -45,10 +40,13 @@ sub run { my (%swiss) = %{XrefParser::BaseParser->get_valid_codes("uniprot",$species_id)}; my (%refseq) = %{XrefParser::BaseParser->get_valid_codes("refseq",$species_id)}; - if(!open(SWISSPROT,"<".$dir."/swissprot.txt")){ - print "ERROR: Could not open $dir/swissprot.txt\n"; - return 1; # 1 error + my $swissprot_io = $self->get_filehandle( $dir . '/swissprot.txt' ); + + if ( !defined $swissprot_io ) { + print "ERROR: Could not open $dir/swissprot.txt\n"; + return 1; # 1 error } + #e.g. 
#ZDB-GENE-000112-30 couptf2 O42532 #ZDB-GENE-000112-32 couptf3 O42533 @@ -58,7 +56,8 @@ sub run { my $spcount =0; my $rscount =0; my $mismatch=0; - while (<SWISSPROT>) { + + while ( $_ = $swissprot_io->getline() ) { chomp; my ($zfin, $label, $acc) = split (/\s+/,$_); if(defined($swiss{$acc})){ @@ -69,16 +68,21 @@ sub run { $mismatch++; } } - close SWISSPROT; - - if(!open(REFSEQ,"<".$dir."/refseq.txt")){ - print "ERROR: Could not open $dir/refseq.txt\n"; + + $swissprot_io->close(); + + my $refseq_io = $self->get_filehandle( $dir . '/refseq.txt' ); + + if ( !defined $refseq_io ) { + print "ERROR: Could not open $dir/refseq.txt\n"; return 1; } + #ZDB-GENE-000125-12 igfbp2 NM_131458 #ZDB-GENE-000125-12 igfbp2 NP_571533 #ZDB-GENE-000125-4 dlc NP_571019 - while (<REFSEQ>) { + + while ( $_ = $refseq_io->getline() ) { chomp; my ($zfin, $label, $acc) = split (/\s+/,$_); if(defined($refseq{$acc})){ @@ -89,18 +93,12 @@ sub run { $mismatch++; } } - close REFSEQ; + + $refseq_io->close(); + print "\t$spcount xrefs from Swissprot and $rscount xrefs from RefSeq succesfully loaded\n"; print "\t$mismatch xrefs ignored\n"; return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::ZFINParser"; - return $self; - -} - 1; diff --git a/misc-scripts/xref_mapping/XrefParser/ncRNAParser.pm b/misc-scripts/xref_mapping/XrefParser/ncRNAParser.pm index 92710ba9cc..8ca57774fb 100644 --- a/misc-scripts/xref_mapping/XrefParser/ncRNAParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/ncRNAParser.pm @@ -4,10 +4,7 @@ use strict; use POSIX qw(strftime); use File::Basename; -use XrefParser::BaseParser; - -use vars qw(@ISA); -@ISA = qw(XrefParser::BaseParser); +use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -41,11 +38,14 @@ sub run { my %name_2_source_id=(); my $added=0; - if(!open(FILE,"<". 
$file)){ - print "ERROR: Could not open file $file\n"; + my $file_io = $self->get_filehandle($file); + + if ( !defined $file_io ) { + print "ERROR: Could not open file $file\n"; return 1; } - while(my $line = <FILE>){ + + while ( my $line = $file_io->getline() ) { chomp $line; my ($gene_id,$transcript_id,$source_name,$acc,$display_label,$full_description, $status) = split("\t",$line); @@ -79,19 +79,11 @@ sub run { #biomart check # $self->add_direct_xref($xref_id, $gene_id, "Gene", "") if (defined($gene_id)); } - close FILE; + + $file_io->close(); print "Added $added Xrefs for ncRNAs\n"; return 0; } -sub new { - - my $self = {}; - bless $self, "XrefParser::ncRNAParser"; - return $self; - -} - - 1; diff --git a/misc-scripts/xref_mapping/xref_parser.pl b/misc-scripts/xref_mapping/xref_parser.pl index 5d401928e1..2fe667106f 100644 --- a/misc-scripts/xref_mapping/xref_parser.pl +++ b/misc-scripts/xref_mapping/xref_parser.pl @@ -3,36 +3,45 @@ use strict; use Getopt::Long; use XrefParser::BaseParser; -my ($host, $port, $dbname, $user, $pass, @species, @sources, $skipdownload, $checkdownload, $create, $release, $cleanup, $drop_existing_db, $deletedownloaded, $dl_path, @notsource); - -GetOptions('dbuser|user=s' => \$user, - 'dbpass|pass=s' => \$pass, - 'dbhost|host=s' => \$host, - 'dbport|port=i' => \$port, - 'dbname=s' => \$dbname, - 'species=s' => \@species, - 'source=s' => \@sources, - 'download_dir=s' => \$dl_path, - 'skipdownload' => \$skipdownload, # skips all downloads - 'checkdownload!' => \$checkdownload, # if file exists it won't be downloaded - 'create' => \$create, - 'setrelease=s' => \$release, - 'cleanup' => \$cleanup, - 'notsource=s' => \@notsource, - 'drop_db|dropdb!' 
=> \$drop_existing_db, # drops xref db without user interaction - 'delete_downloaded' => \$deletedownloaded, - 'download_path=s' => \$dl_path, - 'help' => sub { usage(); exit(0); }); +my ( + $host, $port, $dbname, + $user, $pass, @species, + @sources, $skipdownload, $checkdownload, + $create, $release, $cleanup, + $drop_existing_db, $deletedownloaded, $dl_path, + @notsource, $compressed +); + +GetOptions( + 'dbuser|user=s' => \$user, + 'dbpass|pass=s' => \$pass, + 'dbhost|host=s' => \$host, + 'dbport|port=i' => \$port, + 'dbname=s' => \$dbname, + 'species=s' => \@species, + 'source=s' => \@sources, + 'download_dir=s' => \$dl_path, + 'skipdownload' => \$skipdownload, # skips all downloads + 'checkdownload!' => \$checkdownload, # don't download if exists + 'create' => \$create, + 'setrelease=s' => \$release, + 'cleanup' => \$cleanup, + 'notsource=s' => \@notsource, + 'drop_db|dropdb!' => + \$drop_existing_db, # drops xref db without user interaction + 'delete_downloaded' => \$deletedownloaded, + 'download_path=s' => \$dl_path, + 'compressed' => \$compressed, # don't force decompression of files + 'help' => sub { usage(); exit(0); } +); @species = split(/,/,join(',',@species)); @sources = split(/,/,join(',',@sources)); -if (!$user || !$host || !$dbname) { - - usage(); - exit(1); - +if ( !$user || !$host || !$dbname ) { + usage(); + exit(1); } XrefParser::BaseParser::run( @@ -43,7 +52,8 @@ XrefParser::BaseParser::run( $checkdownload, $create, $release, $cleanup, $drop_existing_db, $deletedownloaded, - $dl_path, \@notsource + $dl_path, \@notsource, + $compressed ); # -------------------------------------------------------------------------------- -- GitLab