From fa3101e506ffbe9947880cbe679edf90b4e71940 Mon Sep 17 00:00:00 2001 From: Ian Longden <ianl@sanger.ac.uk> Date: Tue, 2 Sep 2008 10:02:10 +0000 Subject: [PATCH] -verbose option added, which also needed the run subroutine changed to get the file/s via a ref to an array rather than an array --- .../XrefParser/AedesGenBankParser.pm | 15 +- .../xref_mapping/XrefParser/AgilentParser.pm | 18 +- .../XrefParser/AnophelesSymbolParser.pm | 16 +- .../xref_mapping/XrefParser/BaseParser.pm | 249 ++++++++++-------- .../xref_mapping/XrefParser/CeleraParser.pm | 15 +- .../xref_mapping/XrefParser/CodelinkParser.pm | 16 +- .../xref_mapping/XrefParser/DBASSParser.pm | 42 ++- .../xref_mapping/XrefParser/DirectParser.pm | 2 +- .../XrefParser/EntrezGeneParser.pm | 15 +- .../xref_mapping/XrefParser/FastaParser.pm | 17 +- .../xref_mapping/XrefParser/FlybaseParser.pm | 23 +- .../XrefParser/Flybase_dmel_GFFv3_Parser.pm | 27 +- .../xref_mapping/XrefParser/GOParser.pm | 25 +- .../xref_mapping/XrefParser/HGNCParser.pm | 43 +-- .../xref_mapping/XrefParser/HPAParser.pm | 28 +- .../xref_mapping/XrefParser/IPIParser.pm | 16 +- .../xref_mapping/XrefParser/IlluminaParser.pm | 16 +- .../XrefParser/IlluminaWGParser.pm | 17 +- .../XrefParser/InterproGoParser.pm | 14 +- .../xref_mapping/XrefParser/InterproParser.pm | 47 ++-- .../xref_mapping/XrefParser/JGI_Parser.pm | 19 +- .../xref_mapping/XrefParser/MGDParser.pm | 8 +- .../xref_mapping/XrefParser/MIMParser.pm | 14 +- .../xref_mapping/XrefParser/RGDParser.pm | 10 +- .../XrefParser/RefSeqGPFFParser.pm | 30 ++- .../xref_mapping/XrefParser/RefSeqParser.pm | 21 +- .../xref_mapping/XrefParser/SGDParser.pm | 14 +- .../xref_mapping/XrefParser/SegmentParser.pm | 13 +- .../xref_mapping/XrefParser/UCSCParser.pm | 11 +- .../xref_mapping/XrefParser/UniGeneParser.pm | 22 +- .../xref_mapping/XrefParser/UniProtParser.pm | 44 ++-- .../UniProtParser_descriptions_only.pm | 37 +-- .../XrefParser/UniProtVarSplicParser.pm | 20 +- .../xref_mapping/XrefParser/VbDirectParser.pm | 14 
+- .../xref_mapping/XrefParser/VbGFF3Parser.pm | 16 +- .../xref_mapping/XrefParser/VegaParser.pm | 32 ++- .../XrefParser/Vega_TranParser.pm | 6 +- .../XrefParser/WilsonAffyParser.pm | 29 +- .../xref_mapping/XrefParser/WormPepParser.pm | 16 +- .../WormbaseDatabaseStableIDParser.pm | 5 +- .../XrefParser/XenopusJamboreeParser.pm | 15 +- .../xref_mapping/XrefParser/ZFINParser.pm | 30 ++- .../xref_mapping/XrefParser/ncRNAParser.pm | 8 +- 43 files changed, 668 insertions(+), 427 deletions(-) diff --git a/misc-scripts/xref_mapping/XrefParser/AedesGenBankParser.pm b/misc-scripts/xref_mapping/XrefParser/AedesGenBankParser.pm index ed24106d2c..3699a7e221 100644 --- a/misc-scripts/xref_mapping/XrefParser/AedesGenBankParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/AedesGenBankParser.pm @@ -13,7 +13,16 @@ use base qw( XrefParser::BaseParser ); sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; + my $cpt = 0 ; next if (/^File:/); # skip header @@ -38,7 +47,7 @@ sub run { if ($header eq "") { $header = "Aedes_GenBank".$cpt ; - print STDERR "One sequence with a random name ... \n" ; + print STDERR "One sequence with a random name ... \n" if($verbose); $cpt++ ; } @@ -74,7 +83,7 @@ sub run { $file_io->close(); - print scalar(@xrefs) . " AedesGenBank xrefs succesfully parsed\n"; + print scalar(@xrefs) . 
" AedesGenBank xrefs succesfully parsed\n" if($verbose); XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); diff --git a/misc-scripts/xref_mapping/XrefParser/AgilentParser.pm b/misc-scripts/xref_mapping/XrefParser/AgilentParser.pm index 4fdbee3847..3ee69d8651 100644 --- a/misc-scripts/xref_mapping/XrefParser/AgilentParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/AgilentParser.pm @@ -13,16 +13,22 @@ use base qw( XrefParser::BaseParser ); sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); - my @xrefs; + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; -# local $/ = "\n>"; + my @xrefs; my $ag_io = $self->get_filehandle($file); if ( !defined $ag_io ) { - print "Could not open $file\n"; + print STDERR "Could not open $file\n"; return 1; } @@ -59,11 +65,11 @@ sub run { $ag_io->close(); - print scalar(@xrefs) . " Agilent xrefs succesfully parsed\n"; XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); - print "Done\n"; + print scalar(@xrefs) . 
" Agilent xrefs succesfully parsed\n" if($verbose); + return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/AnophelesSymbolParser.pm b/misc-scripts/xref_mapping/XrefParser/AnophelesSymbolParser.pm index 76294ccfa7..59e24b7f3e 100644 --- a/misc-scripts/xref_mapping/XrefParser/AnophelesSymbolParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/AnophelesSymbolParser.pm @@ -14,7 +14,15 @@ use base qw( XrefParser::BaseParser ); sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; next if (/^File:/); # skip header @@ -25,7 +33,7 @@ sub run { my $file_io = $self->get_filehandle($file); if ( !defined $file_io ) { - print "Could not open $file\n"; + print STDERR "Could not open $file\n"; return 1; } @@ -56,11 +64,11 @@ sub run { $file_io->close(); - print scalar(@xrefs) . " AnophelesSymbol xrefs succesfully parsed\n"; XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); - print "Done\n"; + print scalar(@xrefs) . 
" AnophelesSymbol xrefs succesfully parsed\n" if($verbose); + return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/BaseParser.pm b/misc-scripts/xref_mapping/XrefParser/BaseParser.pm index e7723647f3..93383a7075 100644 --- a/misc-scripts/xref_mapping/XrefParser/BaseParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/BaseParser.pm @@ -38,7 +38,8 @@ my %xref_dependent_mapped; my ( $host, $port, $dbname, $user, $pass, $create, $release, $cleanup, $deletedownloaded, $drop_db, $checkdownload, $dl_path, - $unzip, $stats ); + $unzip, $stats, $verbose); + # -------------------------------------------------------------------------------- # Get info about files to be parsed from the database @@ -51,7 +52,7 @@ sub run { my $sourcesr, $checkdownload, $create, $release, $cleanup, $drop_db, $deletedownloaded, $dl_path, my $notsourcesr, - $unzip, $stats + $unzip, $stats, $verbose ) = @_; $base_dir = $dl_path if $dl_path; @@ -140,7 +141,7 @@ sub run { while ( my @row = $sth->fetchrow_array() ) { - print '-' x 4, "{ $name }", '-' x ( 72 - length($name) ), "\n"; + print '-' x 4, "{ $name }", '-' x ( 72 - length($name) ), "\n" if ($verbose); my $cs; my $file_cs = ""; @@ -176,21 +177,32 @@ sub run { # Database parsing if ( $file =~ /^mysql:/i ) { $dsn = $file; - print "Parsing $dsn with $parser\n"; + print "Parsing $dsn with $parser\n" if ($verbose); eval "require XrefParser::$parser"; my $new = "XrefParser::$parser"->new(); if ( $new->run( $dsn, $source_id, $species_id, - $name, undef ) ) + $name, undef, $verbose ) ) { ++$summary{$name}->{$parser}; } next; } - - - if ( $unzip && ( $file =~ /\.(gz|Z)$/ ) ) { - printf( "Uncompressing '%s' using 'gunzip'\n", $file ); + if ( $file =~ /^script:/i ) { + print "Parsing $file with $parser\n" if ($verbose); + eval "require XrefParser::$parser"; + my $new = "XrefParser::$parser"->new(); + if ( + $new->run_script( $file, $source_id, $species_id, $verbose ) ) + { + ++$summary{$name}->{$parser}; + } + next; + } + + + if ( $unzip && ( 
$file =~ /\.(gz|Z)$/ ) ) { + printf( "Uncompressing '%s' using 'gunzip'\n", $file ) if ($verbose); system( "gunzip", "-f", $file ); } if ($unzip) { $file =~ s/\.(gz|Z)$// } @@ -199,7 +211,7 @@ sub run { # check file size as some .SPC files can be of zero length if ( !defined( $cs = md5sum($file) ) ) { - printf( "Download '%s'\n", $file ); + printf( "Download '%s'\n", $file ) if($verbose); ++$summary{$name}->{$parser}; } else { $file_cs .= ':' . $cs; @@ -209,7 +221,7 @@ sub run { if ( -s $file ) { $parse = 1; print "Checksum for '$file' does not match, " - . "will parse...\n"; + . "will parse...\n" if ($verbose); # Files from sources "Uniprot/SWISSPROT" and # "Uniprot/SPTREMBL" are all parsed with the @@ -223,7 +235,7 @@ sub run { $empty = 1; printf( "The file '%s' has zero length, skipping\n", - $file ); + $file ) if ($verbose); } } } ## end else [ if ( !defined( $cs = md5sum... @@ -237,7 +249,7 @@ sub run { if ( $parse and @files_to_parse and defined $file_cs ) { print "Parsing '" . join( "', '", @files_to_parse ) - . "' with $parser\n"; + . "' with $parser\n" if ($verbose); eval "require XrefParser::$parser"; $@ && warn( "[ERROR] Cannot require $parser: $@" ); @@ -247,7 +259,7 @@ sub run { # Run with $release_url. if ( $new->run( $source_id, $species_id, - @files_to_parse, $release_url ) ) + \@files_to_parse, $release_url, $verbose ) ) { ++$summary{$name}->{$parser}; } @@ -255,7 +267,7 @@ sub run { # Run without $release_url. if ( $new->run( $source_id, $species_id, - @files_to_parse ) ) + \@files_to_parse, undef, $verbose ) ) { ++$summary{$name}->{$parser}; } @@ -273,12 +285,12 @@ sub run { } elsif ( !$dsn && !$empty && @files_to_parse ) { print( "Ignoring '" . join( "', '", @files_to_parse ) - . "' as checksums match\n" ); + . 
"' as checksums match\n" ) if ($verbose); } if ($cleanup) { foreach my $file (@files_to_parse) { - printf( "Deleting '%s'\n", $file ); + printf( "Deleting '%s'\n", $file ) if($verbose); unlink($file); } } @@ -410,7 +422,7 @@ sub run { print join("\t",@{$sum_line{$sum_name}})."\n"; } - } + } # if ($stats) } ## end while ( my @row = $sth->fetchrow_array... @@ -418,20 +430,21 @@ sub run { print "\n", '=' x 80, "\n"; print "Summary of status\n"; print '=' x 80, "\n"; - + + foreach my $source_name ( sort keys %summary ) { - foreach my $parser_name ( keys %{ $summary{$source_name} } ) { - printf( "%30s %-20s\t%s\n", - $source_name, - $parser_name, ( - defined $summary{$source_name}->{$parser_name} - && $summary{$source_name}->{$parser_name} - ? 'FAILED' - : 'OKAY' - ) ); - } + foreach my $parser_name ( keys %{ $summary{$source_name} } ) { + printf( "%30s %-20s\t%s\n", + $source_name, + $parser_name, ( + defined $summary{$source_name}->{$parser_name} + && $summary{$source_name}->{$parser_name} + ? 'FAILED' + : 'OKAY' + ) ); + } } - + if($stats){ my %sum_line; @@ -555,7 +568,11 @@ sub fetch_files { # Change old-style 'LOCAL:' URIs into 'file:'. $user_uri =~ s#^LOCAL:#file:#i; my $uri = URI->new($user_uri); - if ( $uri->scheme() eq 'file' ) { + +# print "\n*******$user_uri\n*********\n"; + if ( $uri->scheme() eq 'script'){ + push( @processed_files, $user_uri ); + }elsif ( $uri->scheme() eq 'file' ) { # Deal with local files. my @local_files; @@ -574,7 +591,7 @@ sub fetch_files { catfile( $dest_dir, basename( $uri->path() ) ); if ( $deletedownloaded && -f $file_path ) { - printf( "Deleting '%s'\n", $file_path ); + printf( "Deleting '%s'\n", $file_path ) if ($verbose); unlink($file_path); } @@ -584,12 +601,12 @@ sub fetch_files { # used (for globbing FTP URIs, we always need to connect # to a FTP site to see what files are there). 
- printf( "File '%s' already exists\n", $file_path ); + printf( "File '%s' already exists\n", $file_path ) if ($verbose); push( @processed_files, $file_path ); next; } - printf( "Connecting to FTP host '%s'\n", $uri->host() ); + printf( "Connecting to FTP host '%s'\n", $uri->host() ) if ($verbose); my $ftp = Net::FTP->new( $uri->host(), 'Debug' => 0 ); if ( !defined($ftp) ) { @@ -624,17 +641,17 @@ sub fetch_files { catfile( $dest_dir, basename($remote_file) ); if ( $deletedownloaded && -f $file_path ) { - printf( "Deleting '%s'\n", $file_path ); + printf( "Deleting '%s'\n", $file_path ) if($verbose); unlink($file_path); } if ( $checkdownload && -f $file_path ) { - printf( "File '%s' already exists\n", $file_path ); + printf( "File '%s' already exists\n", $file_path ) if ($verbose); } else { if ( !-d dirname($file_path) ) { printf( "Creating directory '%s'\n", - dirname($file_path) ); + dirname($file_path) ) if($verbose); if ( !mkdir( dirname($file_path) ) ) { printf( "==> Can not create directory '%s': %s", @@ -645,8 +662,8 @@ sub fetch_files { printf( "Fetching '%s' (size = %s)\n", $remote_file, - $ftp->size($remote_file) || '(unknown)' ); - printf( "Local file is '%s'\n", $file_path ); + $ftp->size($remote_file) || '(unknown)' ) if ($verbose); + printf( "Local file is '%s'\n", $file_path ) if($verbose); if ( !$ftp->get( $remote_file, $file_path ) ) { printf( "==> Could not get '%s': %s\n", @@ -667,7 +684,7 @@ sub fetch_files { catfile( $dest_dir, basename( $uri->path() ) ); if ( $deletedownloaded && -f $file_path ) { - printf( "Deleting '%s'\n", $file_path ); + printf( "Deleting '%s'\n", $file_path ) if($verbose); unlink($file_path); } @@ -675,14 +692,14 @@ sub fetch_files { # The file is already there, no need to connect to a # HTTP server. 
- printf( "File '%s' already exists\n", $file_path ); + printf( "File '%s' already exists\n", $file_path ) if ($verbose); push( @processed_files, $file_path ); next; } if ( !-d dirname($file_path) ) { printf( "Creating directory '%s'\n", - dirname($file_path) ); + dirname($file_path) ) if($verbose); if ( !mkdir( dirname($file_path) ) ) { printf( "==> Can not create directory '%s': %s", dirname($file_path), $! ); @@ -690,14 +707,14 @@ sub fetch_files { } } - printf( "Connecting to HTTP host '%s'\n", $uri->host() ); - printf( "Fetching '%s'\n", $uri->path() ); + printf( "Connecting to HTTP host '%s'\n", $uri->host() ) if($verbose); + printf( "Fetching '%s'\n", $uri->path() ) if($verbose); if ( $checkdownload && -f $file_path ) { - printf( "File '%s' already exists\n", $file_path ); + printf( "File '%s' already exists\n", $file_path ) if($verbose); } else { - printf( "Local file is '%s'\n", $file_path ); + printf( "Local file is '%s'\n", $file_path ) if($verbose); my $ua = LWP::UserAgent->new(); $ua->env_proxy(); @@ -766,7 +783,7 @@ sub get_filehandle if ( !defined $io ) { return undef } - print "Reading from '$file_name'...\n"; + print "Reading from '$file_name'...\n" if($verbose); return $io; } @@ -787,7 +804,7 @@ sub new sub get_source_id_for_filename { my ($self, $file) = @_; - print STDERR "FILE $file\n" ; + print "FILE $file\n" if($verbose) ; my $sql = "SELECT s.source_id FROM source s, source_url su WHERE su.source_id=s.source_id AND su.url LIKE '%/" . $file . "%'"; my $sth = dbi()->prepare($sql); $sth->execute(); @@ -1065,7 +1082,7 @@ sub get_existing_mappings { #print "mgi_to_uniprot{" . $row[1] . "} = " . $row[2] . "\n"; } - print "Got " . scalar(keys(%mappings)) . " $from_source_name -> $to_source_name mappings\n"; + print "Got " . scalar(keys(%mappings)) . 
" $from_source_name -> $to_source_name mappings\n" if($verbose); return \%mappings; @@ -1078,7 +1095,7 @@ sub upload_xref_object_graphs { my ($self, $rxrefs) = @_; my $dbi = dbi(); - print "count = ".$#$rxrefs."\n"; + print "count = ".$#$rxrefs."\n" if($verbose); if ($#$rxrefs > -1) { @@ -1086,7 +1103,7 @@ sub upload_xref_object_graphs { # $self->delete_by_source($rxrefs); # upload new ones - print "Uploading xrefs\n"; + print "Uploading xrefs\n" if($verbose); my $xref_sth = $dbi->prepare("INSERT INTO xref (accession,version,label,description,source_id,species_id) VALUES(?,?,?,?,?,?)"); my $pri_insert_sth = $dbi->prepare("INSERT INTO primary_xref VALUES(?,?,?,?)"); my $pri_update_sth = $dbi->prepare("UPDATE primary_xref SET sequence=? WHERE xref_id=?"); @@ -1106,30 +1123,30 @@ sub upload_xref_object_graphs { return undef; } # Create entry in xref table and note ID - if(! $xref_sth->execute($xref->{ACCESSION}, + if(! $xref_sth->execute($xref->{ACCESSION}, $xref->{VERSION} || 0, $xref->{LABEL}, $xref->{DESCRIPTION}, $xref->{SOURCE_ID}, $xref->{SPECIES_ID})){ - if(!defined($xref->{SOURCE_ID})){ - print "your xref: $xref->{ACCESSION} does not have a source-id\n"; - return undef; - } - $xref_id = insert_or_select($xref_sth, $dbi->err, $xref->{ACCESSION}, $xref->{SOURCE_ID}, $xref->{SPECIES_ID}); - $xref_update_label_sth->execute($xref->{LABEL},$xref_id) if (defined($xref->{LABEL})); - $xref_update_descr_sth->execute($xref->{DESCRIPTION},$xref_id,) if (defined($xref->{DESCRIPTION})); - } - else{ - $xref_id = insert_or_select($xref_sth, $dbi->err, $xref->{ACCESSION}, $xref->{SOURCE_ID}, $xref->{SPECIES_ID}); - } - + if(!defined($xref->{SOURCE_ID})){ + print "your xref: $xref->{ACCESSION} does not have a source-id\n"; + return undef; + } + $xref_id = $self->insert_or_select($xref_sth, $dbi->err, $xref->{ACCESSION}, $xref->{SOURCE_ID}, $xref->{SPECIES_ID}); + $xref_update_label_sth->execute($xref->{LABEL},$xref_id) if (defined($xref->{LABEL})); + 
$xref_update_descr_sth->execute($xref->{DESCRIPTION},$xref_id,) if (defined($xref->{DESCRIPTION})); + } + else{ + $xref_id = $self->insert_or_select($xref_sth, $dbi->err, $xref->{ACCESSION}, $xref->{SOURCE_ID}, $xref->{SPECIES_ID}); + } + # create entry in primary_xref table with sequence; if this is a "cumulative" - # entry it may already exist, and require an UPDATE rather than an INSERT + # entry it may already exist, and require an UPDATE rather than an INSERT if(defined($xref->{SEQUENCE})){ if(!(defined($xref_id) and $xref_id)){ - print STDERR "xref_id is not set for :\n$xref->{ACCESSION}\n$xref->{LABEL}\n$xref->{DESCRIPTION}\n$xref->{SOURCE_ID}\n"; + print STDERR "xref_id is not set for :\n$xref->{ACCESSION}\n$xref->{LABEL}\n$xref->{DESCRIPTION}\n$xref->{SOURCE_ID}\n$xref->{SPECIES_ID}\n"; } if ( primary_xref_id_exists($xref_id) ) { $pri_update_sth->execute( $xref->{SEQUENCE}, $xref_id ) @@ -1161,7 +1178,7 @@ sub upload_xref_object_graphs { $dep{SOURCE_ID}, $xref->{SPECIES_ID}); - my $dep_xref_id = insert_or_select($xref_sth, $dbi->err, $dep{ACCESSION}, $dep{SOURCE_ID}, $xref->{SPECIES_ID}); + my $dep_xref_id = $self->insert_or_select($xref_sth, $dbi->err, $dep{ACCESSION}, $dep{SOURCE_ID}, $xref->{SPECIES_ID}); if($dbi->err){ print STDERR "dbi\t$dbi->err \n$dep{ACCESSION} \n $dep{SOURCE_ID} \n"; @@ -1194,8 +1211,8 @@ sub upload_xref_object_graphs { sub upload_direct_xrefs{ my ($self, $direct_xref) = @_; for my $dr(@$direct_xref) { - # print "having now direct-XREF : $dr->{ENSEMBL_STABLE_ID} \n" ; - my $general_xref_id = get_xref_id_by_accession_and_source($dr->{ACCESSION},$dr->{SOURCE_ID}); +# print "having now direct-XREF : ".$dr->{ENSEMBL_STABLE_ID}."\t".$dr->{SPECIES_ID}." 
\n" ; + my $general_xref_id = get_xref($dr->{ACCESSION},$dr->{SOURCE_ID},$dr->{SPECIES_ID}); if ($general_xref_id){ # print "direct_xref:\n$general_xref_id\n$dr->{ENSEMBL_STABLE_ID}\n$dr->{ENSEMBL_TYPE}\t$dr->{LINKAGE_XREF}\n\n"; $self->add_direct_xref($general_xref_id, $dr->{ENSEMBL_STABLE_ID},$dr->{ENSEMBL_TYPE},$dr->{LINKAGE_XREF}); @@ -1387,6 +1404,28 @@ sub update_source } +# -------------------------------------------------------------------------------- +sub dbi2{ + + my $self = shift; + my ($host, $port, $user, $dbname, $pass) = @_; + my $dbi2; + + if ( !defined $dbi2 || !$dbi2->ping() ) { + my $connect_string = + sprintf( "dbi:mysql:host=%s;port=%s;database=%s", + $host, $port, $dbname ); + + $dbi2 = + DBI->connect( $connect_string, $user, $pass, + { 'RaiseError' => 1 } ) + or croak( "Can't connect to database: " . $DBI::errstr ); + $dbi2->{'mysql_auto_reconnect'} = 1; # Reconnect on timeout + } + + return $dbi2; +} + # -------------------------------------------------------------------------------- sub dbi @@ -1432,7 +1471,7 @@ sub md5sum # -------------------------------------------------------------------------------- -sub get_xref_id_by_accession_and_source { +sub get_xref_id_by_accession_and_source_OLD { my ($acc, $source_id, $species_id ) = @_; @@ -1460,15 +1499,14 @@ SELECT xref_id FROM xref WHERE accession=? AND source_id=?'; sub insert_or_select { - my ($sth, $error, $acc, $source, $species) = @_; + my ($self, $sth, $error, $acc, $source, $species) = @_; my $id; # TODO - check for specific error code rather than for just any error if ($error) { - $id = get_xref_id_by_accession_and_source($acc, $source, $species); -# print STDERR "Got existing xref id " . $id . " for " . $acc . " " . $source . 
"\n"; + $id = $self->get_xref($acc, $source, $species); } else { @@ -1535,17 +1573,17 @@ sub delete_by_source { # now delete them foreach my $source (keys %source_ids) { - print "Deleting pairs with source ID $source \n"; + print "Deleting pairs with source ID $source \n" if($verbose); $pairs_sth->execute($source); - print "Deleting direct xrefs with source ID $source \n"; + print "Deleting direct xrefs with source ID $source \n" if($verbose); $direct_sth->execute($source); - print "Deleting synonyms of xrefs with source ID $source \n"; + print "Deleting synonyms of xrefs with source ID $source \n" if($verbose); $syn_sth->execute($source); - print "Deleting dependent xrefs of xrefs with source ID $source \n"; + print "Deleting dependent xrefs of xrefs with source ID $source \n" if($verbose); $dep_sth->execute($source); - print "Deleting primary xrefs with source ID $source \n"; + print "Deleting primary xrefs with source ID $source \n" if($verbose); # $p_xref_sth->execute($source); - print "Deleting xrefs with source ID $source \n"; + print "Deleting xrefs with source ID $source \n" if($verbose); $xref_sth->execute($source); } @@ -1569,7 +1607,7 @@ sub validate_sources { my $rv = $sth->execute(lc($source)); if ( $rv > 0 ) { - print "Source $source is valid\n"; + print "Source $source is valid\n" if($verbose); } else { print "\nSource $source is not valid; valid sources are:\n"; show_valid_sources(); @@ -1603,18 +1641,19 @@ sub validate_species { my @species_ids; my $dbi = dbi(); - my $sth = $dbi->prepare("SELECT species_id, name FROM species WHERE LOWER(name)=? OR LOWER(aliases) LIKE ?"); + my $sth = $dbi->prepare("SELECT species_id, name FROM species WHERE LOWER(name)=? OR LOWER(aliases) REGEXP ?"); my ($species_id, $species_name); foreach my $sp (@species) { - $sth->execute(lc($sp), "%" . lc($sp) . "%"); +# $sth->execute(lc($sp), "%" . lc($sp) . 
"%"); # no longer allow % as this generates tomany possible errors + $sth->execute(lc($sp), "^".lc($sp).",|[ ]".lc($sp)."[,]|^".lc($sp)."\$|[,] ".lc($sp)."\$" ); $sth->bind_columns(\$species_id, \$species_name); if (my @row = $sth->fetchrow_array()) { - print "Species $sp is valid (name = " . $species_name . ", ID = " . $species_id . ")\n"; + print "Species $sp is valid (name = " . $species_name . ", ID = " . $species_id . ")\n" if($verbose); push @species_ids, $species_id; } else { - print "Species $sp is not valid; valid species are:\n"; + print STDERR "Species $sp is not valid; valid species are:\n"; show_valid_species(); exit(1); } @@ -1631,7 +1670,7 @@ sub show_valid_species() { $sth->execute(); while (my @row = $sth->fetchrow_array()) { - print $row[0] . " (aliases: " . $row[1] . ")\n"; + print STDERR $row[0] . " (aliases: " . $row[1] . ")\n"; } } @@ -1669,14 +1708,14 @@ sub get_direct_xref{ sub get_xref{ - my ($self,$acc,$source) = @_; + my ($self,$acc,$source, $species_id) = @_; if(!defined($get_xref_sth)){ - my $sql = "select xref_id from xref where accession = ? and source_id = ?"; + my $sql = "select xref_id from xref where accession = ? and source_id = ? 
and species_id = ?"; $get_xref_sth = $dbi->prepare($sql); } - $get_xref_sth->execute( $acc, $source ) or croak( $dbi->errstr() ); + $get_xref_sth->execute( $acc, $source, $species_id ) or croak( $dbi->errstr() ); if(my @row = $get_xref_sth->fetchrow_array()) { return $row[0]; } @@ -1688,6 +1727,10 @@ sub add_xref { $species_id ) = @_; + my $xref_id = $self->get_xref($acc,$source_id, $species_id); + if(defined($xref_id)){ + return $xref_id; + } if ( !defined($add_xref_sth) ) { $add_xref_sth = dbi->prepare( "INSERT INTO xref " @@ -1733,14 +1776,14 @@ VALUES } - my $direct_id = $self->get_xref($acc, $source_id); + my $direct_id = $self->get_xref($acc, $source_id, $species_id); if(!defined($direct_id)){ $add_xref_sth->execute( $acc, $version || 0, $label, $description, $source_id, $species_id ) or croak("$acc\t$label\t\t$source_id\t$species_id\n"); } - $direct_id = $self->get_xref($acc, $source_id); + $direct_id = $self->get_xref($acc, $source_id, $species_id); $self->add_direct_xref($direct_id, $direct_xref, $type, ""); } @@ -1772,14 +1815,14 @@ VALUES (?,?,?,?)"); } - my $dependent_id = $self->get_xref($acc, $source_id); + my $dependent_id = $self->get_xref($acc, $source_id, $species_id); if(!defined($dependent_id)){ $add_xref_sth->execute( $acc, $version || 0, $label, $description, $source_id, $species_id ) or croak("$acc\t$label\t\t$source_id\t$species_id\n"); } - $dependent_id = $self->get_xref($acc, $source_id); + $dependent_id = $self->get_xref($acc, $source_id, $species_id); if(!defined($dependent_id)){ croak("$acc\t$label\t\t$source_id\t$species_id\n"); } @@ -1793,14 +1836,14 @@ VALUES } sub add_to_syn_for_mult_sources{ - my ($self, $acc, $sources, $syn) = @_; + my ($self, $acc, $sources, $syn, $species_id) = @_; if(!defined($add_synonym_sth)){ $add_synonym_sth = $dbi->prepare("INSERT INTO synonym VALUES(?,?)"); } my $found =0; foreach my $source_id (@$sources){ - my $xref_id = $self->get_xref($acc, $source_id); + my $xref_id = $self->get_xref($acc, 
$source_id, $species_id); if(defined($xref_id)){ $add_synonym_sth->execute( $xref_id, $syn ) or croak( $dbi->errstr() . "\n $xref_id\n $syn\n" ); @@ -1816,19 +1859,19 @@ sub add_to_syn_for_mult_sources{ sub add_to_syn{ - my ($self, $acc, $source_id, $syn) = @_; + my ($self, $acc, $source_id, $syn, $species_id) = @_; if(!defined($add_synonym_sth)){ $add_synonym_sth = $dbi->prepare("INSERT INTO synonym VALUES(?,?)"); } - my $xref_id = $self->get_xref($acc, $source_id); + my $xref_id = $self->get_xref($acc, $source_id, $species_id); if(defined($xref_id)){ $add_synonym_sth->execute( $xref_id, $syn ) or croak( $dbi->errstr() . "\n $xref_id\n $syn\n" ); } else { croak( "Could not find acc $acc in " - . "xref table source = $source_id\n" ); + . "xref table source = $source_id of species $species_id\n" ); } } @@ -1910,7 +1953,7 @@ sub create { $ini_file, $metadata_file ); if ( system($cmd) == 0 ) { - print("==> Done.\n"); + print("==> Done.\n") if($verbose); } else { if ( $? == -1 ) { croak("Failed to execute: $!\n"); @@ -1935,7 +1978,7 @@ sub create { if ( $drop_db ) { $dbh->do( "DROP DATABASE $dbname" ); - print "Database $dbname dropped\n" ; + print "Database $dbname dropped\n" if($verbose) ; } if ( $create && !$drop_db ) { @@ -1944,7 +1987,7 @@ sub create { chomp $p; if ($p eq "yes") { $dbh->do( "DROP DATABASE $dbname" ); - print "Removed existing database $dbname\n"; + print "Removed existing database $dbname\n" if($verbose); } else { print "$dbname NOT removed\n"; exit(1); @@ -1959,7 +2002,7 @@ sub create { my $table_file = catfile( $sql_dir, 'sql', 'table.sql' ); - printf( "Creating %s from %s\n", $dbname, $table_file ); + printf( "Creating %s from %s\n", $dbname, $table_file ) if($verbose); if ( !-e $table_file ) { croak( "Cannot open " . 
$table_file ); } @@ -1970,7 +2013,7 @@ sub create { or croak("Cannot execute the following command (exit $?):\n$cmd\n"); printf( "Populating metadata in %s from %s\n", - $dbname, $metadata_file ); + $dbname, $metadata_file ) if($verbose); if ( !-e $metadata_file ) { croak( "Cannot open " . $metadata_file ); } @@ -2042,7 +2085,7 @@ sub set_release $dbi->prepare( "UPDATE source SET source_release=? WHERE source_id=?"); - print "Setting release to '$release' for source ID '$source_id'\n"; + print "Setting release to '$release' for source ID '$source_id'\n" if($verbose); $sth->execute( $release, $source_id ); } diff --git a/misc-scripts/xref_mapping/XrefParser/CeleraParser.pm b/misc-scripts/xref_mapping/XrefParser/CeleraParser.pm index 29eee0286f..d2e38ad5b2 100644 --- a/misc-scripts/xref_mapping/XrefParser/CeleraParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/CeleraParser.pm @@ -16,7 +16,15 @@ use base qw( XrefParser::BaseParser ); sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; my $celera_gene_source_id = $self->get_source_id_for_source_name('Celera_Gene'); @@ -27,7 +35,7 @@ sub run { my $file_io = $self->get_filehandle($file); if ( !defined $file_io ) { - print "Could not open $file\n"; + print STDERR "Could not open $file\n"; return 1; } @@ -70,11 +78,10 @@ sub run { $file_io->close(); - print scalar(@xrefs) . " Celera xrefs succesfully parsed\n"; XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); + print scalar(@xrefs) . 
" Celera xrefs succesfully parsed\n" if($verbose); - print "Done\n"; return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/CodelinkParser.pm b/misc-scripts/xref_mapping/XrefParser/CodelinkParser.pm index e6fd95fe60..e314556b60 100644 --- a/misc-scripts/xref_mapping/XrefParser/CodelinkParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/CodelinkParser.pm @@ -14,7 +14,15 @@ use base qw( XrefParser::BaseParser ); sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; my @xrefs; @@ -22,7 +30,7 @@ sub run { my $codelink_io = $self->get_filehandle($file); if ( !defined $codelink_io ) { - print "ERROR: Could not open $file\n"; + print STDERR "ERROR: Could not open $file\n"; return 1; # 1 = error } @@ -52,11 +60,11 @@ sub run { $codelink_io->close(); - print scalar(@xrefs) . " Codelink xrefs succesfully parsed\n"; XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); - print "Done\n"; + print scalar(@xrefs) . 
" Codelink xrefs succesfully parsed\n" if($verbose); + return 0; #successful } diff --git a/misc-scripts/xref_mapping/XrefParser/DBASSParser.pm b/misc-scripts/xref_mapping/XrefParser/DBASSParser.pm index 6086efb561..8c630c9891 100644 --- a/misc-scripts/xref_mapping/XrefParser/DBASSParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/DBASSParser.pm @@ -18,32 +18,28 @@ my $dbi; my $xref_id; my $source_id; -sub new { - my $proto = shift; - my $class = ref $proto || $proto; - my $self = bless {}, $class; - - return $self; -} +sub run { + my $self = shift if (defined(caller(1))); + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $rel_url = shift; + my $verbose = shift; -sub run { - my $self = shift; + my $filename = @{$files}[0]; - my ( $source_id, $species_id, $filename ) = @_; +# my ( $source_id, $species_id, $filename, $rel_url,$verbose ) = @_; my $file_io = $self->get_filehandle($filename); if ( !defined($file_io) ) { return 1; } - my $parsed_count = 0; + my $parsed_count = 0; - printf( STDERR "source = %d\t species = %d\n", - $source_id, $species_id ); - - $file_io->getline(); + $file_io->getline(); while ( defined( my $line = $file_io->getline() ) ) { @@ -53,14 +49,14 @@ sub run { my ( $dbass_gene_id, $dbass_gene_name, $ensembl_id) = split( /,/, $line ); if ( !defined($dbass_gene_id) || !defined($ensembl_id) ) { - printf( "Line %d contains has less than two columns.\n", + printf STDERR ( "Line %d contains has less than two columns.\n", 1 + $parsed_count ); - print ("The parsing failed\n"); + print STDERR ("The parsing failed\n"); return 1; } - my $first_gene_name; + my $first_gene_name = $dbass_gene_name; my $second_gene_name; @@ -71,11 +67,10 @@ sub run { if ($dbass_gene_name =~ /(.*)\((.*)\)/){ $first_gene_name = $1; $second_gene_name = $2; - print $first_gene_name, "\n", $second_gene_name, "\n"; +# print $first_gene_name, "\n", $second_gene_name, "\n" if($verbose); } - my $label = $first_gene_name; my $type = 'gene'; my $description = 
''; @@ -84,7 +79,7 @@ sub run { ++$parsed_count; - my $xref_id = XrefParser::BaseParser->get_xref( $dbass_gene_id, $source_id ); + my $xref_id = XrefParser::BaseParser->get_xref( $dbass_gene_id, $source_id, $species_id ); if ( !defined($xref_id) || $xref_id eq '' ) { $xref_id = XrefParser::BaseParser->add_xref($dbass_gene_id, $version, $label, $description, $source_id, $species_id); @@ -96,7 +91,7 @@ sub run { XrefParser::DBASSParser->add_synonym($xref_id, $synonym); } elsif ($synonym =~ /^\s/){ - print "There is white space! \n"; + print "There is white space! \n" if($verbose); } else { next; @@ -105,11 +100,10 @@ sub run { } ## end while ( defined( my $line... - printf( "%d direct xrefs succesfully parsed\n", $parsed_count ); + printf( "%d direct xrefs succesfully parsed\n", $parsed_count ) if($verbose); $file_io->close(); - print "Done\n"; return 0; } ## end sub run diff --git a/misc-scripts/xref_mapping/XrefParser/DirectParser.pm b/misc-scripts/xref_mapping/XrefParser/DirectParser.pm index 5b3686ac38..02ace4883f 100644 --- a/misc-scripts/xref_mapping/XrefParser/DirectParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/DirectParser.pm @@ -63,7 +63,7 @@ sub run { ++$parsed_count; my $xref_id = - XrefParser::BaseParser->get_xref( $accession, $source_id ); + XrefParser::BaseParser->get_xref( $accession, $source_id, $species_id ); if ( !defined($xref_id) || $xref_id eq '' ) { $xref_id = diff --git a/misc-scripts/xref_mapping/XrefParser/EntrezGeneParser.pm b/misc-scripts/xref_mapping/XrefParser/EntrezGeneParser.pm index aae8c38d93..74dbf7e761 100644 --- a/misc-scripts/xref_mapping/XrefParser/EntrezGeneParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/EntrezGeneParser.pm @@ -23,9 +23,16 @@ if (!defined(caller())) { sub run { my $self = shift if (defined(caller(1))); + my $source_id = shift; my $species_id = shift; - my $file = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; + + if(!defined($source_id)){ 
$source_id = XrefParser::BaseParser->get_source_id_for_filename($file); @@ -39,7 +46,7 @@ sub run { my $eg_io = $self->get_filehandle($file); if ( !defined $eg_io ) { - print "ERROR: Could not open $file\n"; + print STDERR "ERROR: Could not open $file\n"; return 1; # 1 is an error } @@ -111,7 +118,7 @@ sub run { my (@syn) = split(/\|/ ,$arr[$gene_synonyms_index]); foreach my $synonym (@syn){ if($synonym ne "-"){ - $self->add_to_syn($acc, $source_id, $synonym); + $self->add_to_syn($acc, $source_id, $synonym, $species_id); $syn_count++; } } @@ -119,7 +126,7 @@ sub run { $eg_io->close(); - print $xref_count." EntrezGene Xrefs added with $syn_count synonyms\n"; + print $xref_count." EntrezGene Xrefs added with $syn_count synonyms\n" if($verbose); return 0; #successful } diff --git a/misc-scripts/xref_mapping/XrefParser/FastaParser.pm b/misc-scripts/xref_mapping/XrefParser/FastaParser.pm index 9c2bade8a4..4cbc9c369e 100644 --- a/misc-scripts/xref_mapping/XrefParser/FastaParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/FastaParser.pm @@ -14,7 +14,16 @@ use base qw( XrefParser::BaseParser ); sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; + my $sio = Bio::SeqIO->new(-format=>'fasta' , -file=>$file ); my %species_tax_id = %{$self->get_taxonomy_from_species_id($species_id)}; @@ -44,11 +53,13 @@ sub run { } - print scalar(@xrefs) . " Fasta xrefs succesfully parsed\n"; + print scalar(@xrefs) . " Fasta xrefs succesfully parsed\n" if($verbose); $self->upload_xref_object_graphs(\@xrefs); - print "Done\n"; + print scalar(@xrefs) . 
" Fasta xrefs succesfully loaded\n" if($verbose); + + return 0; #successful } diff --git a/misc-scripts/xref_mapping/XrefParser/FlybaseParser.pm b/misc-scripts/xref_mapping/XrefParser/FlybaseParser.pm index c63fc84941..606b58a169 100644 --- a/misc-scripts/xref_mapping/XrefParser/FlybaseParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/FlybaseParser.pm @@ -8,6 +8,7 @@ use warnings; use Carp; use base qw( XrefParser::BaseParser ); +my $verbose; # The object types we'd like to parse. our %object_types = ( gene => 1, @@ -106,7 +107,7 @@ sub get_source_id_for_source_name { $self->SUPER::get_source_id_for_source_name(@_); printf( "source_id for source '%s' is %d\n", - $source_name, $source_id{$source_name} ); + $source_name, $source_id{$source_name} ) if ($verbose); } if ( !defined( $source_id{$source_name} ) @@ -121,8 +122,18 @@ sub get_source_id_for_source_name { } sub run { - my $self = shift; - my ( $source_id, $species_id, $data_file, $release_file ) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + $verbose = shift; + + my $data_file = @{$files}[0]; + +# my $self = shift; +# my ( $source_id, $species_id, $data_file, $release_file ) = @_; # Fetch hashes of already stored Uniprot and Interpro accessions. my %pre_xref_ids = ( @@ -141,7 +152,7 @@ sub run { printf( "%d lines read, %d skipped, %d parsed; %d lines/s\n", $count_read, $count_skipped, $count_read - $count_skipped, - ( $count_read - $last_count_read )/$status_interval ); + ( $count_read - $last_count_read )/$status_interval ) if($verbose); $last_count_read = $count_read; alarm($status_interval); }; @@ -359,10 +370,10 @@ sub run { } ## end while ( defined( my $line... 
$data_io->close(); - print("FlybaseParser Summary:\n"); + print("FlybaseParser Summary:\n") if($verbose); foreach my $label ( sort( keys(%xref_ids) ) ) { my $accessions = $xref_ids{$label}; - printf( "\t%-32s %6d\n", $label, scalar( keys( %{$accessions} ) ) ); + printf( "\t%-32s %6d\n", $label, scalar( keys( %{$accessions} ) ) ) if($verbose); } } ## end sub run diff --git a/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm b/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm index 3c5119b000..3beecdac6b 100644 --- a/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm +++ b/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm @@ -44,6 +44,7 @@ use base qw( XrefParser::BaseParser ); my %cache_source =(); +my $verbose; # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -175,10 +176,20 @@ sub get_source{ } sub run { + my $self = shift if (defined(caller(1))); + my $source_id = shift; my $species_id = shift; - my $file = shift; + my $files = shift; + my $release_file = shift; + $verbose = shift; + + my $file = @{$files}[0]; +# my $self = shift if (defined(caller(1))); +# my $source_id = shift; +# my $species_id = shift; +# my $file = shift; my $species_name; @@ -202,14 +213,14 @@ sub run { # delete previous if running directly rather than via BaseParser if (!defined(caller(1))) { - print "Deleting previous xrefs for these sources\n"; + print "Deleting previous xrefs for these sources\n" if($verbose); XrefParser::BaseParser->delete_by_source(\@xrefs); } - print "... parsed.\n"; - print STDERR "uploading ".scalar(@xrefs)." xrefs's\n"; + print "... parsed.\n" if($verbose); + print STDERR "uploading ".scalar(@xrefs)." xrefs's\n" if($verbose); XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); - print STDERR "uploading ".scalar(@direct_xrefs)." direct-xrefs's\n"; + print STDERR "uploading ".scalar(@direct_xrefs)." 
direct-xrefs's\n" if($verbose); XrefParser::BaseParser->upload_direct_xrefs(\@direct_xrefs); return 0; @@ -232,12 +243,12 @@ sub relink_synonyms_to_xrefs{ sub create_xrefs { my ($self, $flybase_source_id, $file) = @_; - print STDERR "starting to parse $file...." ; + print STDERR "starting to parse $file...." if($verbose); my $gff_io = $self->get_filehandle($file); if ( !defined $gff_io ) { - print "ERROR: Can't open the GFF file $file\n"; + print STDERR "ERROR: Can't open the GFF file $file\n"; return 0; } @@ -1079,7 +1090,7 @@ sub get_species { $sth->finish; if (defined $species_name) { - print "Taxonomy ID " . $taxonomy_id . " corresponds to species ID " . $species_id . " name " . $species_name . "\n"; + print "Taxonomy ID " . $taxonomy_id . " corresponds to species ID " . $species_id . " name " . $species_name . "\n" if($verbose); } else { throw("Cannot find species corresponding to taxonomy ID " . $species_id . " - check species table\n"); } diff --git a/misc-scripts/xref_mapping/XrefParser/GOParser.pm b/misc-scripts/xref_mapping/XrefParser/GOParser.pm index a5d98fdc71..2c7df6aef8 100644 --- a/misc-scripts/xref_mapping/XrefParser/GOParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/GOParser.pm @@ -25,16 +25,17 @@ if (!defined(caller())) { sub run { my $self = shift if (defined(caller(1))); + my $source_id = shift; my $species_id = shift; - my $file = shift; - my $release_file = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; - my %wrongtype; + my $file = @{$files}[0]; -# if(!defined($source_id)){ -# $source_id = $self->get_source_id_for_filename($file); -# } + + my %wrongtype; #get the "main" GO source id. 
$source_id = $self->get_source_id_for_source_name("GO","main"); @@ -80,11 +81,11 @@ sub run { my $go_io = $self->get_filehandle($file); if ( !defined $go_io ) { - print "ERROR: Could not open $file\n"; + print STDERR "ERROR: Could not open $file\n"; return 1; # 1 error } - print "processing for taxon: $tax_id\n"; + print "processing for taxon: $tax_id\n" if($verbose); my $taxon_line = "taxon:".$tax_id; my $miss =0; while ( $_ = $go_io->getline() ) { @@ -142,7 +143,7 @@ sub run { if(defined($worm{$worm_acc})){ my ($xref_id, $stable_id, $type, $link) = split(/::/,$worm{$worm_acc}); - my $new_xref_id=$self->get_xref($array[4],$source_id); + my $new_xref_id=$self->get_xref($array[4],$source_id, $species_id); if(!defined($new_xref_id)){ $new_xref_id = $self->add_xref($array[4],undef,$array[4],"", $source_id, $species_id); @@ -181,7 +182,7 @@ sub run { } elsif(!defined($wrongtype{$array[0]})){ - print STDERR "WARNING: unknown type ".$array[0]."\n"; + print STDERR "WARNING: unknown type ".$array[0]."\n" if($verbose); $wrongtype{$array[0]} = 1; } } @@ -189,7 +190,7 @@ sub run { $go_io->close(); - print "\t$count GO dependent xrefs added $refseq_miss refseq not found and $swiss_miss Swissprot not found \n"; + print "\t$count GO dependent xrefs added $refseq_miss refseq not found and $swiss_miss Swissprot not found \n" if($verbose); } if ( defined $release_file ) { # Parse and set release information from $release_file. 
@@ -205,7 +206,7 @@ sub run { s#.*The following table describes.*?of (GOA.*?)<ul>.*#$1#; $release =~ s#<[^>]+>##g; - print "GO release: '$release'\n"; + print "GO release: '$release'\n" if($verbose); $self->set_release( $source_id, $release ); } diff --git a/misc-scripts/xref_mapping/XrefParser/HGNCParser.pm b/misc-scripts/xref_mapping/XrefParser/HGNCParser.pm index fc919c9e3a..3b7e93e605 100644 --- a/misc-scripts/xref_mapping/XrefParser/HGNCParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/HGNCParser.pm @@ -15,7 +15,7 @@ my $syn_sth; if (!defined(caller())) { if (scalar(@ARGV) != 1) { - print "\nUsage: HGNCParser.pm file <source_id> <species_id>\n\n"; + print STDERR "\nUsage: HGNCParser.pm file <source_id> <species_id>\n\n"; exit(1); } @@ -28,9 +28,12 @@ sub run { my $source_id = shift; my $species_id = shift; - my $file = shift; + my $files_ref = shift; + my $rel_file = shift; + my $verbose = shift; + + my $file = @{$files_ref}[0]; - print STDERR "source = $source_id\tspecies = $species_id\n"; if(!defined($source_id)){ $source_id = XrefParser::BaseParser->get_source_id_for_filename($file); } @@ -68,7 +71,6 @@ sub run { push @list, "refseq_dna"; my (%entrezgene) = %{XrefParser::BaseParser->get_valid_xrefs_for_dependencies("EntrezGene",@list)}; - my $swiss_count = 0; my $refseq_count = 0; my $entrezgene_count = 0; my $ensembl_count = 0; @@ -112,14 +114,14 @@ sub run { if (defined($array[3])) { # dead name, add to synonym my @array2 = split(',\s*', $array[3]); foreach my $arr (@array2){ - XrefParser::BaseParser->add_to_syn($array[0], $hgnc_ensembl_mapped, $arr); + XrefParser::BaseParser->add_to_syn($array[0], $hgnc_ensembl_mapped, $arr, $species_id); } } if (defined($array[4])) { # alias, add to synonym my @array2 = split(',\s*', $array[4]); foreach my $arr (@array2){ - XrefParser::BaseParser->add_to_syn($array[0], $hgnc_ensembl_mapped, $arr); + XrefParser::BaseParser->add_to_syn($array[0], $hgnc_ensembl_mapped, $arr, $species_id); } } @@ -133,14 +135,14 @@ sub 
run { if (defined($array[3])) { # dead name, add to synonym my @array2 = split(',\s*', $array[3]); foreach my $arr (@array2){ - XrefParser::BaseParser->add_to_syn($array[0], $hgnc_refseq_manual, $arr); + XrefParser::BaseParser->add_to_syn($array[0], $hgnc_refseq_manual, $arr, $species_id); } } if (defined($array[4])) { # alias, add to synonym my @array2 = split(',\s*', $array[4]); foreach my $arr (@array2){ - XrefParser::BaseParser->add_to_syn($array[0], $hgnc_refseq_manual, $arr); + XrefParser::BaseParser->add_to_syn($array[0], $hgnc_refseq_manual, $arr, $species_id); } } } @@ -154,14 +156,14 @@ sub run { if (defined($array[3])) { # dead name, add to synonym my @array2 = split(',\s*', $array[3]); foreach my $arr (@array2){ - XrefParser::BaseParser->add_to_syn($array[0], $hgnc_refseq_mapped, $arr); + XrefParser::BaseParser->add_to_syn($array[0], $hgnc_refseq_mapped, $arr, $species_id); } } if (defined($array[4])) { # alias, add to synonym my @array2 = split(',\s*', $array[4]); foreach my $arr (@array2){ - XrefParser::BaseParser->add_to_syn($array[0], $hgnc_refseq_mapped, $arr); + XrefParser::BaseParser->add_to_syn($array[0], $hgnc_refseq_mapped, $arr, $species_id); } } } @@ -176,17 +178,17 @@ sub run { if (defined($array[3])) { # dead name, add to synonym my @array2 = split(',\s*', $array[3]); foreach my $arr (@array2){ - XrefParser::BaseParser->add_to_syn($array[0], $hgnc_entrezgene_manual, $arr); + XrefParser::BaseParser->add_to_syn($array[0], $hgnc_entrezgene_manual, $arr, $species_id); } } if (defined($array[4])) { # alias, add to synonym my @array2 = split(',\s*', $array[4]); foreach my $arr (@array2){ - XrefParser::BaseParser->add_to_syn($array[0], $hgnc_entrezgene_manual, $arr); + XrefParser::BaseParser->add_to_syn($array[0], $hgnc_entrezgene_manual, $arr, $species_id); } } - } + } } if(defined($array[7])){ @@ -198,30 +200,31 @@ sub run { if (defined($array[3])) { # dead name, add to synonym my @array2 = split(',\s*', $array[3]); foreach my $arr (@array2){ - 
XrefParser::BaseParser->add_to_syn($array[0], $hgnc_entrezgene_mapped, $arr); + XrefParser::BaseParser->add_to_syn($array[0], $hgnc_entrezgene_mapped, $arr, $species_id); } } if (defined($array[4])) { # alias, add to synonym my @array2 = split(',\s*', $array[4]); foreach my $arr (@array2){ - XrefParser::BaseParser->add_to_syn($array[0], $hgnc_entrezgene_mapped, $arr); + XrefParser::BaseParser->add_to_syn($array[0], $hgnc_entrezgene_mapped, $arr, $species_id); } } } } if(!$seen){ # Store to keep descriptions etc $self->add_xref($array[0], "", $array[1], $array[2], $source_id, $species_id); + $mismatch++; } } # while HGNC - $hugo_io->getline(); - - print "Loaded a total of " . ($refseq_count + $entrezgene_count) . " HGNC xrefs, $refseq_count from RefSeq curated mappings and $entrezgene_count from EntrezGene mappings\n"; - - print "$mismatch xrefs could not be associated via RefSeq or EntrezGene\n"; + $hugo_io->close(); + + print "Loaded a total of " . ($refseq_count + $entrezgene_count) . 
" HGNC xrefs, $refseq_count from RefSeq curated mappings and $entrezgene_count from EntrezGene mappings and $ensembl_count from ensembl_mapping\n" if($verbose); + + print "$mismatch xrefs could not be associated via RefSeq, EntrezGene or ensembl\n" if($verbose); return 0; # successful diff --git a/misc-scripts/xref_mapping/XrefParser/HPAParser.pm b/misc-scripts/xref_mapping/XrefParser/HPAParser.pm index 38465320ea..e77da82af1 100644 --- a/misc-scripts/xref_mapping/XrefParser/HPAParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/HPAParser.pm @@ -14,19 +14,16 @@ use base qw( XrefParser::BaseParser); # 4) Link (URL) -sub new { - my $proto = shift; - - my $class = ref $proto || $proto; - my $self = bless {}, $class; - - return $self; -} - sub run { - my $self = shift; + my $self = shift if (defined(caller(1))); - my ( $source_id, $species_id, $filename ) = @_; + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $rel_url = shift; + my $verbose = shift; + + my $filename = @{$files}[0]; my $file_io = $self->get_filehandle($filename); if ( !defined($file_io) ) { @@ -35,9 +32,6 @@ sub run { my $parsed_count = 0; - printf( STDERR "source = %d\t species = %d\n", - $source_id, $species_id ); - $file_io->getline(); while ( defined( my $line = $file_io->getline() ) ) { @@ -61,7 +55,7 @@ sub run { ++$parsed_count; - my $xref_id = XrefParser::BaseParser->get_xref( $antibody_id, $source_id ); + my $xref_id = XrefParser::BaseParser->get_xref( $antibody_id, $source_id, $species_id ); if ( !defined($xref_id) || $xref_id eq '' ) { $xref_id = XrefParser::BaseParser->add_xref($antibody_id, $version, $label, $description, $source_id, $species_id); @@ -72,12 +66,10 @@ sub run { } ## end while ( defined( my $line... 
- printf( "%d direct xrefs succesfully parsed\n", $parsed_count ); + printf( "%d direct xrefs succesfully parsed\n", $parsed_count ) if($verbose); $file_io->close(); - print "Done\n"; - return 0; } ## end sub run diff --git a/misc-scripts/xref_mapping/XrefParser/IPIParser.pm b/misc-scripts/xref_mapping/XrefParser/IPIParser.pm index 75f70a6ff1..05b59025e2 100644 --- a/misc-scripts/xref_mapping/XrefParser/IPIParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/IPIParser.pm @@ -13,7 +13,15 @@ use base qw( XrefParser::BaseParser ); sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; my @xrefs; @@ -22,7 +30,7 @@ sub run { my $ipi_io = $self->get_filehandle($file); if ( !defined $ipi_io ) { - print "ERROR: Could not open $file\n"; + print STDERR "ERROR: Could not open $file\n"; return 1; # 1 = error } @@ -65,11 +73,11 @@ sub run { $ipi_io->close(); - print scalar(@xrefs) . " IPI xrefs succesfully parsed\n"; XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); - print "Done\n"; + print scalar(@xrefs) . 
" IPI xrefs succesfully parsed\n" if($verbose); + return 0; #successful } diff --git a/misc-scripts/xref_mapping/XrefParser/IlluminaParser.pm b/misc-scripts/xref_mapping/XrefParser/IlluminaParser.pm index 55beefae90..b0a1d86724 100644 --- a/misc-scripts/xref_mapping/XrefParser/IlluminaParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/IlluminaParser.pm @@ -14,14 +14,22 @@ use base qw( XrefParser::BaseParser ); sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; my @xrefs; my $file_io = $self->get_filehandle($file); if ( !defined $file_io ) { - print "Could not open $file\n"; + print STDERR "Could not open $file\n"; return 1; } @@ -63,10 +71,10 @@ sub run { $file_io->close(); - print scalar(@xrefs) . " Illumina V2 xrefs succesfully parsed\n"; - XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); + print scalar(@xrefs) . " Illumina V2 xrefs succesfully parsed\n" if($verbose); + return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/IlluminaWGParser.pm b/misc-scripts/xref_mapping/XrefParser/IlluminaWGParser.pm index 12bb2fdf9b..56236e9d76 100644 --- a/misc-scripts/xref_mapping/XrefParser/IlluminaWGParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/IlluminaWGParser.pm @@ -7,14 +7,23 @@ use base qw( XrefParser::BaseParser ); sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; + my @xrefs; my $file_io = $self->get_filehandle($file); if ( !defined $file_io ) { - print "Could not open $file\n"; + print STDERR "Could not open $file\n"; return 1; } @@ -85,11 +94,11 @@ sub run { $file_io->close(); - print scalar(@xrefs) . 
" Illumina V2 xrefs succesfully parsed\n"; - XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); + print scalar(@xrefs) . " Illumina V2 xrefs succesfully parsed\n" if($verbose); + return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/InterproGoParser.pm b/misc-scripts/xref_mapping/XrefParser/InterproGoParser.pm index 02ad1bf478..ac3e9abdde 100644 --- a/misc-scripts/xref_mapping/XrefParser/InterproGoParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/InterproGoParser.pm @@ -6,14 +6,22 @@ use vars qw(@ISA); @ISA = qw(XrefParser::BaseParser); sub run { - my( $self, $source_id, $species_id, $file ) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; my $file_io = $self->get_filehandle($file) || ( print( "ERROR: Cannot open $file\n" ) && return 1 ); my %interpros = %{$self->get_valid_codes("interpro",$species_id)}; scalar( keys %interpros ) - || ( print( "ERROR: No InterPro xrefs found in DB" ) && return 1 ); + || ( print STDERR "ERROR: No InterPro xrefs found in DB" && return 1 ); #get the "main" GO source id. $source_id = $self->get_source_id_for_source_name("GO","main"); @@ -49,7 +57,7 @@ sub run { } print "Parsed identifiers from $file\n". "\tadded $dependent_xref_count GO xrefs dependent on InterPro\n". 
- "\tskipped $skip_count GO terms due to missing InterPros\n"; + "\tskipped $skip_count GO terms due to missing InterPros\n" if($verbose); return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/InterproParser.pm b/misc-scripts/xref_mapping/XrefParser/InterproParser.pm index 69a26cbb9c..e4deab7fa5 100644 --- a/misc-scripts/xref_mapping/XrefParser/InterproParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/InterproParser.pm @@ -30,10 +30,13 @@ sub run { my $source_id = shift; my $species_id = shift; - my $file = shift; + my $files_ref = shift; my $release_file = shift; + my $verbose = shift; + + + my $file = @{$files_ref}[0]; - print STDERR "source = $source_id\tspecies = $species_id\n"; if(!defined($source_id)){ $source_id = $self->get_source_id_for_filename($file); } @@ -56,10 +59,10 @@ sub run { . "(accession,version,label,description,source_id,species_id) " . "VALUES(?,?,?,?,?,?)" ); - my $get_xref_sth = - $self->dbi() - ->prepare( "SELECT xref_id FROM xref " - . "WHERE accession = ? AND source_id = ?" ); +# my $get_xref_sth = +# $self->dbi() +# ->prepare( "SELECT xref_id FROM xref " +# . "WHERE accession = ? AND source_id = ?" 
); my $dir = dirname($file); @@ -70,7 +73,7 @@ sub run { my $xml_io = $self->get_filehandle($file); if ( !defined $xml_io ) { - print "ERROR: Can't open hugo interpro file $dir/interpro.xml\n"; + print "ERROR: Can't open hugo interpro file $file\n"; return 1; # 1= error } @@ -92,7 +95,7 @@ sub run { if ($interpro) { # print $interpro."\n"; - if ( !get_xref( $get_xref_sth, $interpro, $source_id ) ) { + if ( !$self->get_xref( $interpro, $source_id, $species_id ) ) { $count{INTERPRO}++; if ( !$add_xref_sth->execute( @@ -101,7 +104,7 @@ sub run { ) ) { - print "Problem adding '$interpro'\n"; + print STDERR "Problem adding '$interpro'\n"; return 1; # 1 is an error } } @@ -115,7 +118,7 @@ sub run { my ( $db_type, $id ) = ( $1, $2 ); if( $db_type eq 'SSF' ){ $id =~ s/^SSF// } # Strip SSF prefix - if ( !get_xref( $get_interpro_sth, $interpro, $id ) ) { + if ( !$self->get_xref( $interpro, $id, $species_id ) ) { $add_interpro_sth->execute( $interpro, $id ); $count{$db_type}++; } @@ -126,7 +129,7 @@ sub run { $xml_io->close(); for my $db ( keys %count ) { - print "\t" . $count{$db} . " $db loaded.\n"; + print "\t" . $count{$db} . 
" $db loaded.\n" if($verbose); } if ( defined $release_file ) { @@ -145,24 +148,24 @@ sub run { $release_io->close(); if ( defined $release ) { - print "Interpro release is '$release'\n"; + print "Interpro release is '$release'\n" if($verbose); $self->set_release( $source_id, $release ); } else { - print "Did not find release info in '$release_file'\n"; + print "Did not find release info in '$release_file'\n" if($verbose); } } return 0; } -sub get_xref{ - my ($get_xref_sth, $acc, $source) = @_; - - $get_xref_sth->execute($acc, $source) || die "FAILED $acc $source\n"; - if(my @row = $get_xref_sth->fetchrow_array()) { - return $row[0]; - } - return 0; -} +#sub get_xref{ +# my ($get_xref_sth, $acc, $source) = @_; +# +# $get_xref_sth->execute($acc, $source) || die "FAILED $acc $source\n"; +# if(my @row = $get_xref_sth->fetchrow_array()) { +# return $row[0]; +# } +# return 0; +#} 1; diff --git a/misc-scripts/xref_mapping/XrefParser/JGI_Parser.pm b/misc-scripts/xref_mapping/XrefParser/JGI_Parser.pm index f76003ffcc..cb32def5a1 100644 --- a/misc-scripts/xref_mapping/XrefParser/JGI_Parser.pm +++ b/misc-scripts/xref_mapping/XrefParser/JGI_Parser.pm @@ -13,7 +13,15 @@ use base qw( XrefParser::BaseParser ); sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; my $source_name = $self->get_source_name_for_source_id ($source_id) ; @@ -50,7 +58,7 @@ sub run { my $file_io = $self->get_filehandle($file); if ( !defined $file_io ) { - print "ERROR: Could not open $file\n"; + print STDERR "ERROR: Could not open $file\n"; return 1; # 1 is an error } @@ -88,7 +96,7 @@ sub run { $version = "JGI 2.0" ; }else { - print "WARNING : The source-name specified in the populate_metatable.sql file is\n" . 
+ print STDERR "WARNING : The source-name specified in the populate_metatable.sql file is\n" . "WARNING : not matching the differnt cases specified in JGI_Parser.pm - plese\n" . "WARNING : edit the parser \n" ; return 1; @@ -124,11 +132,10 @@ sub run { $file_io->close(); - print scalar(@xrefs) . " JGI_ xrefs succesfully parsed\n"; + print scalar(@xrefs) . " JGI_ xrefs succesfully parsed\n" if($verbose); XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs); - print "Done\n"; return 0; # successful } @@ -138,8 +145,6 @@ sub new my $proto = shift; my $self = $proto->SUPER::new(@_); - print "\n\nhave new jp\n"; - return $self; } diff --git a/misc-scripts/xref_mapping/XrefParser/MGDParser.pm b/misc-scripts/xref_mapping/XrefParser/MGDParser.pm index 7193a88c4d..709aa4c3f8 100644 --- a/misc-scripts/xref_mapping/XrefParser/MGDParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/MGDParser.pm @@ -31,7 +31,11 @@ sub run { my $source_id = shift; my $species_id = shift; - my $file = shift; + my $files = shift; + my $release_file = shift; + $verbose = shift; + + my $file = @{$files}[0]; die "No longer used. 
MGI is taken form the uniprot file\n"; @@ -92,7 +96,7 @@ sub run { chomp ; my ($key,$syn) = (split)[0,4]; if(defined($mgi_good{$key})){ - $self->add_to_syn($key, $source_id, $syn); + $self->add_to_syn($key, $source_id, $syn, $species_id); $synonyms++; } } diff --git a/misc-scripts/xref_mapping/XrefParser/MIMParser.pm b/misc-scripts/xref_mapping/XrefParser/MIMParser.pm index b380b7b418..c951a70bd2 100644 --- a/misc-scripts/xref_mapping/XrefParser/MIMParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/MIMParser.pm @@ -26,7 +26,11 @@ sub run { my $general_source_id = shift; my $species_id = shift; - my $file = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; my %old_to_new; my %removed; @@ -46,7 +50,7 @@ sub run { my $morbid_source_id = XrefParser::BaseParser->get_source_id_for_source_name("MIM_MORBID"); push @sources, $morbid_source_id; - print "sources are:- ".join(", ",@sources)."\n"; + print "sources are:- ".join(", ",@sources)."\n" if($verbose); local $/ = "*RECORD*"; @@ -115,12 +119,12 @@ sub run { $new = $old_to_new{$new}; } if(!defined($removed{$new})){ - $self->add_to_syn_for_mult_sources($new, \@sources, $old); + $self->add_to_syn_for_mult_sources($new, \@sources, $old, $species_id); $syn_count++; } } - print "$gene genemap and $phenotype phenotype MIM xrefs added\n"; - print "added $syn_count synonyms (defined by MOVED TO)\n"; + print "$gene genemap and $phenotype phenotype MIM xrefs added\n" if($verbose); + print "added $syn_count synonyms (defined by MOVED TO)\n" if($verbose); return 0; #successful } diff --git a/misc-scripts/xref_mapping/XrefParser/RGDParser.pm b/misc-scripts/xref_mapping/XrefParser/RGDParser.pm index 6770c8f2e0..4c8d54fc73 100644 --- a/misc-scripts/xref_mapping/XrefParser/RGDParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RGDParser.pm @@ -30,7 +30,11 @@ sub run { my $source_id = shift; my $species_id = shift; - my $file = shift; + my $files = shift; + my $release_file = 
shift; + my $verbose = shift; + + my $file = @{$files}[0]; if(!defined($source_id)){ $source_id = XrefParser::BaseParser->get_source_id_for_filename($file); @@ -104,8 +108,8 @@ sub run { $rgd_io->close(); - print "\t$count xrefs succesfully loaded and dependent on refseq\n"; - print "\t$mismatch xrefs added but with NO dependencies\n"; + print "\t$count xrefs succesfully loaded and dependent on refseq\n" if($verbose); + print "\t$mismatch xrefs added but with NO dependencies\n" if($verbose); return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm index d67626acb0..179e5d4ff8 100644 --- a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm @@ -24,13 +24,20 @@ if (!defined(caller())) { # -------------------------------------------------------------------------------- +my $verbose; + sub run { my $self = shift if (defined(caller(1))); my $source_id = shift; my $species_id = shift; - my @files = @_; + my $files_ref = shift; + my $rel_file = shift; + $verbose = shift; + + my @files = @{$files_ref}; + my $release_file; @@ -43,8 +50,8 @@ sub run { my $dna_source_id = $self->get_source_id_for_source_name('RefSeq_dna'); - print "RefSeq_peptide source ID = $peptide_source_id\n"; - print "RefSeq_dna source ID = $dna_source_id\n"; + print "RefSeq_peptide source ID = $peptide_source_id\n" if($verbose); + print "RefSeq_dna source ID = $dna_source_id\n" if($verbose); my $pred_peptide_source_id = $self->get_source_id_for_source_name('RefSeq_peptide_predicted'); @@ -52,8 +59,8 @@ sub run { $self->get_source_id_for_source_name('RefSeq_dna_predicted'); print "RefSeq_peptide_predicted source ID = " - . "$pred_peptide_source_id\n"; - print "RefSeq_dna_predicted source ID = $pred_dna_source_id\n"; + . 
"$pred_peptide_source_id\n" if($verbose); + print "RefSeq_dna_predicted source ID = $pred_dna_source_id\n" if($verbose); my @xrefs; foreach my $file (@files) { @@ -96,7 +103,7 @@ sub run { # Put a comma after the release number to make it more readable. $release =~ s/Release (\d+)/Release $1,/; - print "RefSeq release: '$release'\n"; + print "RefSeq release: '$release'\n" if($verbose); $self->set_release( $source_id, $release ); $self->set_release( $peptide_source_id, $release ); @@ -128,18 +135,13 @@ sub create_xrefs { my @tax_ids = @{$species2tax{$species_id}}; my %name2species_id = map{ $_=>$species_id } @names; my %taxonomy2species_id = map{ $_=>$species_id } @tax_ids; - # my %name2species_id = $self->name2species_id(); - # my %taxonomy2species_id = $self->taxonomy2species_id(); my %dependent_sources = $self->get_dependent_xref_sources(); -# my (%genemap) = %{$self->get_valid_codes("mim_gene",$species_id)}; -# my (%morbidmap) = %{$self->get_valid_codes("mim_morbid",$species_id)}; - my $refseq_io = $self->get_filehandle($file); if ( !defined $refseq_io ) { - print "ERROR: Can't open RefSeqGPFF file $file\n"; + print STDERR "ERROR: Can't open RefSeqGPFF file $file\n"; return undef; } @@ -165,7 +167,7 @@ sub create_xrefs { $type = 'peptide'; }else{ - print "Could not work out sequence type for $file\n"; + print STDERR "Could not work out sequence type for $file\n"; return undef; } @@ -293,7 +295,7 @@ sub create_xrefs { $refseq_io->close(); - print "Read " . scalar(@xrefs) ." xrefs from $file\n"; + print "Read " . scalar(@xrefs) ." 
xrefs from $file\n" if($verbose); return \@xrefs; diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm index 6132364af5..1f1c57799e 100644 --- a/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm @@ -8,6 +8,8 @@ use File::Basename; use base qw( XrefParser::BaseParser ); + +my $verbose; # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -28,10 +30,13 @@ sub run { my $self = shift if (defined(caller(1))); - my $source_id = shift; + my $source_id = shift; my $species_id = shift; - my @files = @_; + my $files_ref = shift; + my $rel_file = shift; + $verbose = shift; + my @files = @{$files_ref}; my $release_file; if ( $files[-1] =~ /RefSeq-release/ ) { @@ -43,8 +48,8 @@ sub run { my $dna_source_id = $self->get_source_id_for_source_name('RefSeq_dna'); - print "RefSeq_peptide source ID = $peptide_source_id\n"; - print "RefSeq_dna source ID = $dna_source_id\n"; + print "RefSeq_peptide source ID = $peptide_source_id\n" if($verbose); + print "RefSeq_dna source ID = $dna_source_id\n" if($verbose); my $pred_peptide_source_id = $self->get_source_id_for_source_name('RefSeq_peptide_predicted'); @@ -52,8 +57,8 @@ sub run { $self->get_source_id_for_source_name('RefSeq_dna_predicted'); print "RefSeq_peptide_predicted source ID = " - . "$pred_peptide_source_id\n"; - print "RefSeq_dna_predicted source ID = $pred_dna_source_id\n"; + . 
"$pred_peptide_source_id\n" if($verbose); + print "RefSeq_dna_predicted source ID = $pred_dna_source_id\n" if($verbose); my @xrefs; foreach my $file (@files) { @@ -128,7 +133,7 @@ sub create_xrefs { my $refseq_io = $self->get_filehandle($file); if ( !defined $refseq_io ) { - print "ERROR: Can't open RefSeq file $file\n"; + print STDERR "ERROR: Can't open RefSeq file $file\n"; return undef; } @@ -205,7 +210,7 @@ sub create_xrefs { $refseq_io->close(); - print "Read " . scalar(@xrefs) ." xrefs from $file\n"; + print "Read " . scalar(@xrefs) ." xrefs from $file\n" if($verbose); return \@xrefs; diff --git a/misc-scripts/xref_mapping/XrefParser/SGDParser.pm b/misc-scripts/xref_mapping/XrefParser/SGDParser.pm index 5c20c87410..76fd234b89 100644 --- a/misc-scripts/xref_mapping/XrefParser/SGDParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/SGDParser.pm @@ -26,7 +26,11 @@ sub run { my $source_id = shift; my $species_id = shift; - my $file = shift; + my $files_ref = shift; + my $rel_file = shift; + my $verbose = shift; + + my $file = @{$files_ref}[0]; if(!defined($source_id)){ $source_id = XrefParser::BaseParser->get_source_id_for_filename($file); @@ -35,12 +39,12 @@ sub run { $species_id = XrefParser::BaseParser->get_species_id_for_filename($file); } - + my $sgd_io = $self->get_filehandle($file); if ( !defined $sgd_io ) { - print "ERROR: Could not open $file\n"; + print STDERR "ERROR: Could not open $file\n"; return 1; # 1 is an error } @@ -55,14 +59,14 @@ sub run { $self->add_xref($sgd_id,"",$locus_name,$desc,$source_id,$species_id); $xref_count++; foreach my $synonym (@syn){ - $self->add_to_syn($sgd_id, $source_id, $synonym); + $self->add_to_syn($sgd_id, $source_id, $synonym, $species_id); $syn_count++; } } $sgd_io->close(); - print $xref_count." SGD Xrefs added with $syn_count synonyms\n"; + print $xref_count." 
SGD Xrefs added with $syn_count synonyms\n" if($verbose); return 0; #successful } diff --git a/misc-scripts/xref_mapping/XrefParser/SegmentParser.pm b/misc-scripts/xref_mapping/XrefParser/SegmentParser.pm index 501f4f6322..24d33f5198 100644 --- a/misc-scripts/xref_mapping/XrefParser/SegmentParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/SegmentParser.pm @@ -25,7 +25,12 @@ sub run { my $source_id = shift; my $species_id = shift; - my $file = shift; + my $files_ref = shift; + my $rel_file = shift; + my $verbose = shift; + + my $file = @{$files_ref}[0]; + if(!defined($source_id)){ $source_id = XrefParser::BaseParser->get_source_id_for_filename($file); @@ -40,7 +45,7 @@ sub run { my $file_io = $self->get_filehandle($file); if ( !defined $file_io ) { - print "ERROR: Could not open file $file\n"; + print STDERR "ERROR: Could not open file $file\n"; return 1; } @@ -58,7 +63,7 @@ sub run { } $name_2_source_id{$source_name} = $tmp; } - my $xref_id = $self->get_xref($acc,$name_2_source_id{$source_name}); + my $xref_id = $self->get_xref($acc,$name_2_source_id{$source_name}, $species_id); if(!defined($xref_id)){ $xref_id = $self->add_xref($acc,"",$display_label,$description,$name_2_source_id{$source_name}, $species_id); $added++; @@ -72,7 +77,7 @@ sub run { $file_io->close(); - print "Added $added Xrefs for Gene segments\n"; + print "Added $added Xrefs for Gene segments\n" if($verbose); return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/UCSCParser.pm b/misc-scripts/xref_mapping/XrefParser/UCSCParser.pm index 84258f5653..617ed75ca8 100644 --- a/misc-scripts/xref_mapping/XrefParser/UCSCParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/UCSCParser.pm @@ -8,8 +8,15 @@ use warnings; use base qw( XrefParser::CoordinateParser ); sub run { - my $self = shift; - my ( $source_id, $species_id, $data_file, $release_file ) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; 
+ my $verbose = shift; + + my $data_file = @{$files}[0]; # Get the $source_id for the "UCSC" source. $source_id = $self->get_source_id_for_source_name('UCSC'); diff --git a/misc-scripts/xref_mapping/XrefParser/UniGeneParser.pm b/misc-scripts/xref_mapping/XrefParser/UniGeneParser.pm index d53e9fde3d..424d85cdec 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniGeneParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniGeneParser.pm @@ -8,6 +8,8 @@ use File::Basename; use base qw( XrefParser::BaseParser ); +my $verbose; + # -------------------------------------------------------------------------------- # Parse command line and run if being run directly @@ -30,13 +32,16 @@ sub run { my $source_id = shift; my $species_id = shift; - my $uniq_file = shift; - my $data_file = shift; - my $release_file = shift; + my $files = shift; + my $release_file = shift; + $verbose = shift; + + my $uniq_file = @{$files}[0]; + my $data_file = @{$files}[1]; my $unigene_source_id = $self->get_source_id_for_source_name('UniGene'); - print "UniGene source ID = $unigene_source_id.\n"; + print "UniGene source ID = $unigene_source_id.\n" if($verbose); if ( !defined($species_id) ) { $species_id = @@ -83,8 +88,7 @@ sub run { $release =~ s/\s{2,}/ /g; $release =~ s/^(.*) UniGene/$1, UniGene/; - print "UniGene release: '$release'\n"; - #$self->set_release( $source_id, $release ); + print "UniGene release: '$release'\n" if($verbose); $self->set_release( $unigene_source_id, $release ); } } @@ -107,7 +111,7 @@ sub get_desc{ my $desc_io = $self->get_filehandle( $data_file ); if ( !defined $desc_io ) { - print "ERROR: Can't open $data_file\n"; + print STDERR "ERROR: Can't open $data_file\n"; return undef; } @@ -149,7 +153,7 @@ sub create_xrefs { my $unigene_io = $self->get_filehandle($uniq_file); if ( !defined $unigene_io ) { - print "Can't open RefSeq file $uniq_file\n"; + print STDERR "Can't open RefSeq file $uniq_file\n"; return undef; } @@ -205,7 +209,7 @@ sub create_xrefs { 
$unigene_io->close(); %geneid_2_desc=(); - print "Read " . scalar(@xrefs) ." xrefs from $uniq_file\n"; + print "Read " . scalar(@xrefs) ." xrefs from $uniq_file\n" if($verbose); return \@xrefs; diff --git a/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm b/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm index e322b60b66..eba95aabb5 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm @@ -15,14 +15,17 @@ use File::Basename; use base qw( XrefParser::BaseParser ); + +my $verbose; + # -------------------------------------------------------------------------------- # Parse command line and run if being run directly if (!defined(caller())) { if (scalar(@ARGV) != 3) { - print "\nUsage: UniProtParser.pm file.SPC <source_id> <species_id>\n\n"; - print scalar(@ARGV); + print STDERR "\nUsage: UniProtParser.pm file.SPC <source_id> <species_id>\n\n"; + print STDERR scalar(@ARGV); exit(1); } @@ -38,8 +41,11 @@ sub run { my $source_id = shift; my $species_id = shift; - my $file = shift; - my $release_file = shift; + my $files = shift; + my $release_file = shift; + $verbose = shift; + + my $file = @{$files}[0]; my $species_name; @@ -54,8 +60,8 @@ sub run { $sptr_source_id = $self->get_source_id_for_source_name('Uniprot/SPTREMBL'); - print "SwissProt source id for $file: $sp_source_id\n"; - print "SpTREMBL source id for $file: $sptr_source_id\n"; + print "SwissProt source id for $file: $sp_source_id\n" if ($verbose); + print "SpTREMBL source id for $file: $sptr_source_id\n" if ($verbose); my @xrefs = @@ -68,7 +74,7 @@ sub run { # delete previous if running directly rather than via BaseParser if (!defined(caller(1))) { - print "Deleting previous xrefs for these sources\n"; + print "Deleting previous xrefs for these sources\n" if($verbose); $self->delete_by_source(\@xrefs); } @@ -93,10 +99,10 @@ sub run { while ( defined( my $line = $release_io->getline() ) ) { if ( $line =~ m#(UniProtKB/Swiss-Prot 
Release .*)# ) { $sp_release = $1; - print "Swiss-Prot release is '$sp_release'\n"; + print "Swiss-Prot release is '$sp_release'\n" if($verbose); } elsif ( $line =~ m#(UniProtKB/TrEMBL Release .*)# ) { $sptr_release = $1; - print "SpTrEMBL release is '$sptr_release'\n"; + print "SpTrEMBL release is '$sptr_release'\n" if($verbose); } } $release_io->close(); @@ -133,11 +139,11 @@ sub get_species { if (defined $species_name) { - print "Taxonomy ID " . $taxonomy_id . " corresponds to species ID " . $species_id . " name " . $species_name . "\n"; + print "Taxonomy ID " . $taxonomy_id . " corresponds to species ID " . $species_id . " name " . $species_name . "\n" if($verbose); } else { - print "Cannot find species corresponding to taxonomy ID " . $species_id . " - check species table\n"; + print STDERR "Cannot find species corresponding to taxonomy ID " . $species_id . " - check species table\n"; exit(1); } @@ -178,10 +184,10 @@ sub create_xrefs { # my $go_source_id = $self->get_source_id_for_source_name('GO'); my $embl_pred_source_id = $dependent_sources{'EMBL_predicted'}; my $protein_id_pred_source_id = $dependent_sources{'protein_id_predicted'}; - print "Predicted SwissProt source id for $file: $sp_pred_source_id\n"; - print "Predicted SpTREMBL source id for $file: $sptr_pred_source_id\n"; - print "Predicted EMBL source id for $file: $embl_pred_source_id\n"; - print "Predicted protein_id source id for $file: $protein_id_pred_source_id\n"; + print "Predicted SwissProt source id for $file: $sp_pred_source_id\n" if($verbose); + print "Predicted SpTREMBL source id for $file: $sptr_pred_source_id\n" if($verbose); + print "Predicted EMBL source id for $file: $embl_pred_source_id\n" if($verbose); + print "Predicted protein_id source id for $file: $protein_id_pred_source_id\n" if($verbose); # print "GO source id for $file: $go_source_id\n"; my (%genemap) = @@ -424,12 +430,12 @@ sub create_xrefs { $uniprot_io->close(); - print "Read $num_sp SwissProt xrefs and $num_sptr 
SPTrEMBL xrefs from $file\n"; - print "Found $num_sp_pred predicted SwissProt xrefs and $num_sptr_pred predicted SPTrEMBL xrefs\n" if ($num_sp_pred > 0 || $num_sptr_pred > 0); + print "Read $num_sp SwissProt xrefs and $num_sptr SPTrEMBL xrefs from $file\n" if($verbose); + print "Found $num_sp_pred predicted SwissProt xrefs and $num_sptr_pred predicted SPTrEMBL xrefs\n" if (($num_sp_pred > 0 || $num_sptr_pred > 0) and $verbose); - print "Added the following dependent xrefs:-\n"; + print "Added the following dependent xrefs:-\n" if($verbose); foreach my $key (keys %dependent_xrefs){ - print $key."\t".$dependent_xrefs{$key}."\n"; + print $key."\t".$dependent_xrefs{$key}."\n" if($verbose); } return \@xrefs; diff --git a/misc-scripts/xref_mapping/XrefParser/UniProtParser_descriptions_only.pm b/misc-scripts/xref_mapping/XrefParser/UniProtParser_descriptions_only.pm index cae047405d..fe5e8cd13d 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniProtParser_descriptions_only.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniProtParser_descriptions_only.pm @@ -21,8 +21,8 @@ use base qw( XrefParser::BaseParser ); if (!defined(caller())) { if (scalar(@ARGV) != 3) { - print "\nUsage: UniProtParser.pm file.SPC <source_id> <species_id>\n\n"; - print scalar(@ARGV); + print STDERR "\nUsage: UniProtParser.pm file.SPC <source_id> <species_id>\n\n"; + print STDERR scalar(@ARGV); exit(1); } @@ -30,16 +30,21 @@ if (!defined(caller())) { } +my $verbose; + # -------------------------------------------------------------------------------- sub run { my $self = shift if (defined(caller(1))); - my $source_id = shift; - my $species_id = shift; - my $file = shift; + my $source_id = shift; + my $species_id = shift; + my $files = shift; my $release_file = shift; + $verbose = shift; + + my $file = @{$files}[0]; my $species_name; @@ -54,8 +59,8 @@ sub run { $sptr_source_id = $self->get_source_id_for_source_name('Uniprot/SPTREMBL'); - print "SwissProt source id for $file: $sp_source_id\n"; - print 
"SpTREMBL source id for $file: $sptr_source_id\n"; + print "SwissProt source id for $file: $sp_source_id\n" if($verbose); + print "SpTREMBL source id for $file: $sptr_source_id\n" if($verbose); my @xrefs = @@ -68,7 +73,7 @@ sub run { # delete previous if running directly rather than via BaseParser if (!defined(caller(1))) { - print "Deleting previous xrefs for these sources\n"; + print "Deleting previous xrefs for these sources\n" if($verbose); $self->delete_by_source(\@xrefs); } @@ -93,10 +98,10 @@ sub run { while ( defined( my $line = $release_io->getline() ) ) { if ( $line =~ m#(UniProtKB/Swiss-Prot Release .*)# ) { $sp_release = $1; - print "Swiss-Prot release is '$sp_release'\n"; + print "Swiss-Prot release is '$sp_release'\n" if($verbose); } elsif ( $line =~ m#(UniProtKB/TrEMBL Release .*)# ) { $sptr_release = $1; - print "SpTrEMBL release is '$sptr_release'\n"; + print "SpTrEMBL release is '$sptr_release'\n" if($verbose); } } $release_io->close(); @@ -133,11 +138,11 @@ sub get_species { if (defined $species_name) { - print "Taxonomy ID " . $taxonomy_id . " corresponds to species ID " . $species_id . " name " . $species_name . "\n"; + print "Taxonomy ID " . $taxonomy_id . " corresponds to species ID " . $species_id . " name " . $species_name . "\n" if($verbose); } else { - print "Cannot find species corresponding to taxonomy ID " . $species_id . " - check species table\n"; + print STDERR "Cannot find species corresponding to taxonomy ID " . $species_id . 
" - check species table\n"; exit(1); } @@ -168,8 +173,8 @@ sub create_xrefs { $self->get_source_id_for_source_name( 'Uniprot/SPTREMBL_predicted'); - print "Predicted SwissProt source id for $file: $sp_pred_source_id\n"; - print "Predicted SpTREMBL source id for $file: $sptr_pred_source_id\n"; + print "Predicted SwissProt source id for $file: $sp_pred_source_id\n" if($verbose); + print "Predicted SpTREMBL source id for $file: $sptr_pred_source_id\n" if($verbose); my $uniprot_io = $self->get_filehandle($file); if ( !defined $uniprot_io ) { return undef } @@ -295,8 +300,8 @@ sub create_xrefs { $uniprot_io->close(); - print "Read $num_sp SwissProt xrefs and $num_sptr SPTrEMBL xrefs from $file\n"; - print "Found $num_sp_pred predicted SwissProt xrefs and $num_sptr_pred predicted SPTrEMBL xrefs\n" if ($num_sp_pred > 0 || $num_sptr_pred > 0); + print "Read $num_sp SwissProt xrefs and $num_sptr SPTrEMBL xrefs from $file\n" if($verbose); + print "Found $num_sp_pred predicted SwissProt xrefs and $num_sptr_pred predicted SPTrEMBL xrefs\n" if (($num_sp_pred > 0 || $num_sptr_pred > 0) and $verbose); return \@xrefs; diff --git a/misc-scripts/xref_mapping/XrefParser/UniProtVarSplicParser.pm b/misc-scripts/xref_mapping/XrefParser/UniProtVarSplicParser.pm index 9b3a6dad7d..2ac20eb65c 100644 --- a/misc-scripts/xref_mapping/XrefParser/UniProtVarSplicParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/UniProtVarSplicParser.pm @@ -18,7 +18,15 @@ use base qw( XrefParser::BaseParser ); sub run { - my ( $self, $source_id, $species_id, $file, $release_file ) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; my @xrefs; @@ -27,12 +35,13 @@ sub run { my $file_io = $self->get_filehandle($file); if ( !defined $file_io ) { - print "ERROR: Could not open $file\n"; + print STDERR "ERROR: Could not open $file\n"; return 1; # 1 error } my %swiss = 
%{ $self->get_valid_codes( "uniprot", $species_id ) }; + print scalar(%swiss)." uniprot entries will be used as tests\n" if($verbose); my $missed = 0; while ( $_ = $file_io->getline() ) { my $xref; @@ -68,8 +77,8 @@ sub run { $file_io->close(); - print $missed." ignored as original uniprot not found in database\n"; - print scalar(@xrefs) . " UniProtVarSplic xrefs succesfully parsed\n"; + print $missed." ignored as original uniprot not found in database\n" if($verbose); + print scalar(@xrefs) . " UniProtVarSplic xrefs succesfully parsed\n" if($verbose); $self->upload_xref_object_graphs(\@xrefs); @@ -79,7 +88,7 @@ sub run { my $release_io = $self->get_filehandle($release_file); while ( defined( my $line = $release_io->getline() ) ) { if ( $line =~ m#(UniProtKB/Swiss-Prot Release .*)# ) { - print "Swiss-Prot release is '$1'\n"; + print "Swiss-Prot release is '$1'\n" if($verbose); $self->set_release( $source_id, $1 ); } } @@ -87,7 +96,6 @@ sub run { } - print "Done\n"; return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/VbDirectParser.pm b/misc-scripts/xref_mapping/XrefParser/VbDirectParser.pm index 91e8ef7508..e9d3b88451 100644 --- a/misc-scripts/xref_mapping/XrefParser/VbDirectParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/VbDirectParser.pm @@ -23,33 +23,25 @@ sub run { my $i=0; print STDERR "source: ".$source_id."\tspecies: ".$species_id."\n"; -# while ($ln = <INFILE>) { -my $type = "transcript"; + my $type = "transcript"; while (my $ln = <INFILE>) { -# print STDERR $ln."\t!\n"; chomp($ln); my ($probe,$id, $version, $description, $ensembl_id) = split("\t",$ln); $i++; - my $xref_id = XrefParser::BaseParser->get_xref($probe, $source_id); + my $xref_id = XrefParser::BaseParser->get_xref($probe, $source_id, $species_id); if (!defined($xref_id) || $xref_id eq "") { -# print STDERR "adding xref\n"; $xref_id = XrefParser::BaseParser->add_xref($probe, 1, $probe, $description, $source_id, $species_id); } -# else { -# print "xref found in DB\t".$xref_id."\n"; 
-# } -# print "xref = ".$xref_id."\n"; XrefParser::BaseParser->add_direct_xref($xref_id, $ensembl_id, $type, $probe); } - print $i." VB direct xrefs succesfully parsed\n"; + print $i." VB direct xrefs succesfully parsed\n" if($verbose); close(INFILE); - print "Done\n"; return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/VbGFF3Parser.pm b/misc-scripts/xref_mapping/XrefParser/VbGFF3Parser.pm index f154b74399..950efaa284 100644 --- a/misc-scripts/xref_mapping/XrefParser/VbGFF3Parser.pm +++ b/misc-scripts/xref_mapping/XrefParser/VbGFF3Parser.pm @@ -12,10 +12,18 @@ use base qw( XrefParser::CoordinateParser ); # Parser for GFF3-format probe mappings from Vectorbase sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; + open (INFILE, "<$file"); my $i=0; my $type = "transcript"; - print STDERR "source: ".$source_id."\tspecies: ".$species_id."\n"; while (my $ln = <INFILE>) { # # parse GFF line: chomp($ln); $i++; @@ -62,12 +70,12 @@ sub run { 'cdsEnd' => $end, 'exonStarts' => $start, 'exonEnds' => $end ); - print STDERR "$name, $start, $end\n"; + print STDERR "$name, $start, $end\n" if($verbose); $self->add_xref( $source_id, $species_id, \%xref ); } } } - print STDERR $i." VB GFF3 xrefs succesfully parsed\n"; + print STDERR $i." 
VB GFF3 xrefs succesfully parsed\n" if($verbose); close(INFILE); return 0; diff --git a/misc-scripts/xref_mapping/XrefParser/VegaParser.pm b/misc-scripts/xref_mapping/XrefParser/VegaParser.pm index 314a0911ea..2202b91874 100644 --- a/misc-scripts/xref_mapping/XrefParser/VegaParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/VegaParser.pm @@ -22,17 +22,24 @@ use base qw( XrefParser::BaseParser ); sub run { - my $self = shift; - my ( $source_id, $species_id, $file_name ) = @_; - - my $file_io = $self->get_filehandle($file_name); - - if ( !defined $file_io ) { - return 1; # Failed. - } - - my @xrefs; - while ( defined( my $line = $file_io->getline() ) ) { + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files_ref = shift; + my $rel_file = shift; + my $verbose = shift; + + my $file_name = @{$files_ref}[0]; + + my $file_io = $self->get_filehandle($file_name); + + if ( !defined $file_io ) { + return 1; # Failed. + } + + my @xrefs; + while ( defined( my $line = $file_io->getline() ) ) { chomp $line; if ( substr( $line, 0, 1 ) eq '>' ) { @@ -62,11 +69,10 @@ sub run } } - print scalar(@xrefs) . " Vega Fasta Xrefs successfully parsed\n"; $self->upload_xref_object_graphs( \@xrefs ); - print "Done\n"; + print scalar(@xrefs) . " Vega Fasta Xrefs successfully parsed\n" if($verbose); return 0; # Successful. 
} diff --git a/misc-scripts/xref_mapping/XrefParser/Vega_TranParser.pm b/misc-scripts/xref_mapping/XrefParser/Vega_TranParser.pm index 4670b88c1f..5b7d3b6f55 100644 --- a/misc-scripts/xref_mapping/XrefParser/Vega_TranParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/Vega_TranParser.pm @@ -13,6 +13,8 @@ sub run { my ($self, $source_id, $species_id, $file) = @_; + die "No longer used try HGNC_curated_transcript\n"; + my $vega_io = $self->get_filehandle($file); my $clone_source_id = @@ -21,7 +23,7 @@ sub run { $self->get_source_id_for_source_name('HGNC_curated_transcript'); if ( !defined $vega_io ) { - print "Could not open $file\n"; + print STDERR "Could not open $file\n"; return 1; } @@ -55,7 +57,7 @@ sub run { $self->add_direct_xref($xref_id, $stable_id, "transcript", ""); } - print "Parsed $line_count lines from $file, added $xref_count xrefs and $xref_count direct_xrefs\n"; + print "Parsed $line_count lines from $file, added $xref_count xrefs and $xref_count direct_xrefs\n" if($verbose); $vega_io->close(); diff --git a/misc-scripts/xref_mapping/XrefParser/WilsonAffyParser.pm b/misc-scripts/xref_mapping/XrefParser/WilsonAffyParser.pm index b2025e2850..9eb1ee65e6 100644 --- a/misc-scripts/xref_mapping/XrefParser/WilsonAffyParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/WilsonAffyParser.pm @@ -10,9 +10,16 @@ my $syn_sth; sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); - my @xrefs = $self->create_xrefs($source_id, $species_id, $file); + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; +# my ($self, $source_id, $species_id, $file) = @_; + + my @xrefs = $self->create_xrefs($source_id, $species_id, @{$files}[0], $verbose); if(!@xrefs){ return 1; # 1 error @@ -27,7 +34,7 @@ sub run { sub create_xrefs { - my ($self, $source_id, $species_id, $file) = @_; + my ($self, $source_id, $species_id, $file, $verbose) = @_; my ($count, $noseq, 
$direct) = (0,0,0); @@ -38,7 +45,7 @@ sub create_xrefs { my $file_io = $self->get_filehandle($file); if ( !defined $file_io ) { - print "ERROR: Could not open $file\n"; + print STDERR "ERROR: Could not open $file\n"; return 1; # 1 error } @@ -98,17 +105,15 @@ sub create_xrefs { # Add description noting where the mapping came from $xref->{DESCRIPTION} = $target . " used as mapping target"; - #print $xref->{ACCESSION} . " " . $target . " " . $? . "\n"; - $count++; - print "$count " if ($count % 100 == 0); + print "$count " if (($count % 100 == 0) and $verbose); push @xrefs, $xref; } else { - print "Couldn't get sequence for $target\n"; + print STDERR "Couldn't get sequence for $target\n"; $noseq++; } @@ -119,9 +124,11 @@ sub create_xrefs { $file_io->close(); - print "\n\nParsed $count primary xrefs.\n"; - print "Couldn't get sequence for $noseq primary_xrefs\n" if ($noseq); - print "Added $direct direct xrefs.\n"; + if($verbose){ + print "\n\nParsed $count primary xrefs.\n"; + print "Couldn't get sequence for $noseq primary_xrefs\n" if ($noseq); + print "Added $direct direct xrefs.\n"; + } return \@xrefs; diff --git a/misc-scripts/xref_mapping/XrefParser/WormPepParser.pm b/misc-scripts/xref_mapping/XrefParser/WormPepParser.pm index 522c3544ed..e86d288af1 100644 --- a/misc-scripts/xref_mapping/XrefParser/WormPepParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/WormPepParser.pm @@ -17,14 +17,20 @@ my $dep_sth; sub run { - my ($self, $source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; - print STDERR "WORMPep source = $source_id\tspecies = $species_id\n"; my $worm_source_id = XrefParser::BaseParser->get_source_id_for_source_name('wormpep_id'); my $worm_locus_id = XrefParser::BaseParser->get_source_id_for_source_name('wormbase_locus'); - print STDERR "source = $worm_source_id, locus = 
$worm_locus_id.\n"; my $xref_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$worm_source_id AND species_id=$species_id"); my $xref_sth2 = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$worm_locus_id AND species_id=$species_id"); @@ -32,7 +38,7 @@ sub run { my $pep_io = $self->get_filehandle($file); if ( !defined $pep_io ) { - print "ERROR: Could not open $file\n"; + print STDERR "ERROR: Could not open $file\n"; return 1; # 1 error } @@ -69,7 +75,7 @@ sub run { $pep_io->close(); - print "Added $d_count direct xrefs and $x_count xrefs\n"; + print "Added $d_count direct xrefs and $x_count xrefs\n" if($verbose); return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseDatabaseStableIDParser.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseDatabaseStableIDParser.pm index 92d43cabed..a26e3f5b21 100644 --- a/misc-scripts/xref_mapping/XrefParser/WormbaseDatabaseStableIDParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/WormbaseDatabaseStableIDParser.pm @@ -10,8 +10,7 @@ use strict; use base qw( XrefParser::DatabaseParser ); sub run { - - my ($self, $dsn, $source_id, $species_id) = @_; + my ($self, $dsn, $source_id, $species_id, $verbose) = @_; my $db = $self->connect($dsn); # source db (probably core) my $xref_db = $self->dbi(); @@ -22,7 +21,7 @@ sub run { # read stable IDs foreach my $type ('gene', 'transcript') { - print "Building xrefs from $type stable IDs\n"; + print "Building xrefs from $type stable IDs\n" if($verbose); my $wb_source_id = $self->get_source_id_for_source_name("wormbase_$type"); diff --git a/misc-scripts/xref_mapping/XrefParser/XenopusJamboreeParser.pm b/misc-scripts/xref_mapping/XrefParser/XenopusJamboreeParser.pm index c9b1535209..38280dd582 100644 --- a/misc-scripts/xref_mapping/XrefParser/XenopusJamboreeParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/XenopusJamboreeParser.pm @@ -15,7 +15,15 @@ use base qw( XrefParser::BaseParser ); sub run { - my ($self, 
$source_id, $species_id, $file) = @_; + my $self = shift if (defined(caller(1))); + + my $source_id = shift; + my $species_id = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; my @xrefs; @@ -24,7 +32,7 @@ sub run { my $file_io = $self->getline($file); if ( !defined $file_io ) { - print "ERROR: Could not open $file\n"; + print STDERR "ERROR: Could not open $file\n"; return 1; # 1 error } @@ -56,13 +64,12 @@ sub run { $file_io->close(); - print scalar(@xrefs) . " XenopusJamboreeParser xrefs succesfully parsed\n"; + print scalar(@xrefs) . " XenopusJamboreeParser xrefs succesfully parsed\n" if($verbose); if(!defined(XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs))){ return 1; #1 error } - print "Done\n"; return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/ZFINParser.pm b/misc-scripts/xref_mapping/XrefParser/ZFINParser.pm index 6dae3ce2b1..a235f1040a 100644 --- a/misc-scripts/xref_mapping/XrefParser/ZFINParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/ZFINParser.pm @@ -13,7 +13,7 @@ use base qw( XrefParser::BaseParser ); if (!defined(caller())) { if (scalar(@ARGV) != 1) { - print "\nUsage: ZFINParser.pm file <source_id> <species_id>\n\n"; + print STDERR "\nUsage: ZFINParser.pm file <source_id> <species_id>\n\n"; exit(1); } @@ -21,13 +21,18 @@ if (!defined(caller())) { } + sub run { my $self = shift if (defined(caller(1))); my $source_id = shift; my $species_id = shift; - my $file = shift; + my $files = shift; + my $release_file = shift; + my $verbose = shift; + + my $file = @{$files}[0]; if(!defined($source_id)){ $source_id = XrefParser::BaseParser->get_source_id_for_filename($file); @@ -46,8 +51,7 @@ sub run { $self->get_filehandle( catfile( $dir, 'uniprot.txt' ) ); if ( !defined $swissprot_io ) { - print( "ERROR: Could not open " . catfile( $dir, 'uniprot.txt' ), - "\n" ); + print STDERR "ERROR: Could not open " . catfile( $dir, 'uniprot.txt' ). 
"\n" ; return 1; # 1 error } @@ -78,8 +82,7 @@ sub run { my $refseq_io = $self->get_filehandle( catfile( $dir, 'refseq.txt' ) ); if ( !defined $refseq_io ) { - print( "ERROR: Could not open " . catfile( $dir, 'refseq.txt' ), - "\n" ); + print STDERR "ERROR: Could not open " . catfile( $dir, 'refseq.txt' ),"\n" ; return 1; } @@ -106,8 +109,7 @@ sub run { my $zfin_io = $self->get_filehandle( catfile( $dir, 'aliases.txt' ) ); if ( !defined $zfin_io ) { - print( "ERROR: Could not open " . catfile( $dir, 'aliases.txt' ), - "\n" ); + print STDERR "ERROR: Could not open " . catfile( $dir, 'aliases.txt' ), "\n" ; return 1; } @@ -120,17 +122,19 @@ sub run { chomp; my ($acc, undef, undef, $syn) = split (/\t/,$_); if(defined($zfin{$acc})){ - XrefParser::BaseParser->add_to_syn($acc, $source_id, $syn); + XrefParser::BaseParser->add_to_syn($acc, $source_id, $syn, $species_id); $syncount++; } } $zfin_io->close(); - print "\t$spcount xrefs from UniProt and\n"; - print "\t$rscount xrefs from RefSeq succesfully loaded\n"; - print "\t$syncount synonyms loaded\n"; - print "\t$mismatch xrefs ignored\n"; + if($verbose){ + print "\t$spcount xrefs from UniProt and\n"; + print "\t$rscount xrefs from RefSeq succesfully loaded\n"; + print "\t$syncount synonyms loaded\n"; + print "\t$mismatch xrefs ignored\n"; + } return 0; } diff --git a/misc-scripts/xref_mapping/XrefParser/ncRNAParser.pm b/misc-scripts/xref_mapping/XrefParser/ncRNAParser.pm index a0478ba73a..9fc2ce5e67 100644 --- a/misc-scripts/xref_mapping/XrefParser/ncRNAParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/ncRNAParser.pm @@ -12,7 +12,7 @@ use base qw( XrefParser::BaseParser ); if (!defined(caller())) { if (scalar(@ARGV) != 1) { - print "\nUsage: ncRNAParser.pm file <source_id> <species_id>\n\n"; + print STDERR "\nUsage: ncRNAParser.pm file <source_id> <species_id>\n\n"; exit(1); } @@ -41,7 +41,7 @@ sub run { my $file_io = $self->get_filehandle($file); if ( !defined $file_io ) { - print "ERROR: Could not open file 
$file\n"; + print STDERR "ERROR: Could not open file $file\n"; return 1; } @@ -67,7 +67,7 @@ sub run { } $name_2_source_id{$source_name} = $tmp; } - my $xref_id = $self->get_xref($acc,$name_2_source_id{$source_name}); + my $xref_id = $self->get_xref($acc,$name_2_source_id{$source_name}, $species_id); if(!defined($xref_id)){ $xref_id = $self->add_xref($acc,"",$display_label,$description,$name_2_source_id{$source_name}, $species_id); $added++; @@ -82,7 +82,7 @@ sub run { $file_io->close(); - print "Added $added Xrefs for ncRNAs\n"; + print "Added $added Xrefs for ncRNAs\n" if($verbose); return 0; } -- GitLab