diff --git a/misc-scripts/id_mapping/fake_stable_id_mapping.pl b/misc-scripts/id_mapping/fake_stable_id_mapping.pl index 814bd53d4d9e95f71da6c80cd9c776bae467c569..ce0eedc27b791b3fdaca751f90c4260721be62d0 100755 --- a/misc-scripts/id_mapping/fake_stable_id_mapping.pl +++ b/misc-scripts/id_mapping/fake_stable_id_mapping.pl @@ -25,7 +25,6 @@ Options: --altuser=USER use old database username USER --altpass=PASS use old database passwort PASS - --mapping_session_id|msi=ID latest mapping session --gene_stable_id_file|gsi_file|gsi=FILE the path of the file containing a list of gene stable Ids that were deleted @@ -34,6 +33,7 @@ Options: containing a list of transcript stable Ids that were deleted --skip_ncrna|skip_ncRNA|skip_nc=0|1 (optionally) skip ncRNAs + --skip_biotypes=LIST (optionally) skip LISTed biotypes --logfile, --log=FILE log to FILE (default: *STDOUT) --logpath=PATH write logfile to PATH (default: .) @@ -101,10 +101,10 @@ $support->parse_extra_options( 'altuser=s', 'altpass=s', 'altdbname=s', - 'mapping_session_id|msi=n', 'gene_stable_id_file|gsi_file|gsi=s', 'transcript_stable_id_file|tsi_file|tsi=s', - 'skip_ncrna|skip_ncRNA|skip_nc=s' + 'skip_ncrna|skip_ncRNA|skip_nc=s', + 'skip_biotypes=s@' ); $support->allowed_params( $support->get_common_params, @@ -113,10 +113,10 @@ $support->allowed_params( 'altuser', 'altpass', 'altdbname', - 'mapping_session_id', 'gene_stable_id_file', 'transcript_stable_id_file', 'skip_ncrna', + 'skip_biotypes', ); if ($support->param('help') or $support->error) { @@ -124,6 +124,8 @@ if ($support->param('help') or $support->error) { pod2usage(1); } +$support->comma_to_list('skip_biotypes'); + # ask user to confirm parameters to proceed $support->confirm_params; @@ -135,7 +137,6 @@ $support->check_required_params( 'altport', 'altuser', 'altdbname', - 'mapping_session_id', 'gene_stable_id_file', ); @@ -143,235 +144,357 @@ $support->check_required_params( my $dba_new = $support->get_database('core'); my $dba_old = $support->get_database('ensembl', 'alt'); my $dbh_new = $dba_new->dbc->db_handle; -my $ta = $dba_old->get_TranscriptAdaptor; -my $ga = $dba_old->get_GeneAdaptor; - -my $sql; -my $c; -my $sth; -# -# read list of deleted gene_stable_ids from file -# -$support->log_stamped("Reading list of deleted gene_stable_ids from file, and fetching associated transcript and translation stable IDs from the db...\n"); +# define some globally used variables +my %genes; my %gsi; my %tsi; my %tlsi; -my %genes; -my $gfh = $support->filehandle('<', $support->param('gene_stable_id_file')); +my $gsi_string; +my $tsi_string; +my $tlsi_string; -while (my $g = <$gfh>) { - chomp $g; - my $gene = $ga->fetch_by_stable_id($g); +# read list of deleted gene and transcript stable IDs from file(s) +&parse_deleted_files; - # skip non-protein-coding genes - unless ($gene->biotype eq 'protein_coding') { - $support->log_warning("Gene ".$gene->stable_id." is non-protein_coding, skipping.\n", 1); - next; - } - - $genes{$g} = $gene; - $gsi{$g} = 1; +# create new mapping session +my $mapping_session_id = &create_mapping_session; - # fetch associated transcript and translation stable IDs from the 37 db - foreach my $transcript (@{ $gene->get_all_Transcripts }) { - $tsi{$transcript->stable_id} = 1; - $tlsi{$transcript->translation->stable_id} = 1; - } -} +# create stable_id_event entries for all objects, mapping to themselves +&create_stable_id_events; + +# set stable_id_event.new_stable_id to NULL for deleted objects +&mark_deleted; -# -# read list of deleted transcript_stable_ids from file -# -if ($support->param('transcript_stable_id_file')) { +# populate gene_archive and peptide_archive +&populated_archive; + +# finish logfile +$support->finish_log; + + +### END main ### - $support->log_stamped("Reading list of deleted transcript_stable_ids from file, and fetching associated translation stable IDs from the db...\n"); +sub parse_deleted_files { - my $tfh = $support->filehandle('<', $support->param('transcript_stable_id_file')); + my $ta = $dba_old->get_TranscriptAdaptor; + my $ga = $dba_old->get_GeneAdaptor; - while (my $t = <$tfh>) { - chomp $t; - my $transcript = $ta->fetch_by_stable_id($t); + # + # read list of deleted gene_stable_ids from file + # + $support->log_stamped("Reading list of deleted gene_stable_ids from file, and fetching associated transcript and translation stable IDs from the db...\n"); + my $gfh = $support->filehandle('<', $support->param('gene_stable_id_file')); + + while (my $g = <$gfh>) { + chomp $g; + my $gene = $ga->fetch_by_stable_id($g); # skip non-protein-coding genes - unless ($transcript->biotype eq 'protein_coding') { - $support->log_warning("Transcript ".$transcript->stable_id." is non-protein_coding, skipping.\n", 1); + unless ($gene->biotype eq 'protein_coding') { + $support->log_warning("Gene ".$gene->stable_id." is non-protein_coding, skipping.\n", 1); next; } - $tsi{$transcript->stable_id} = 1; - $tlsi{$transcript->translation->stable_id} = 1; - - my $gene = $ga->fetch_by_transcript_id($transcript->dbID); - $genes{$gene->stable_id} = $gene; + $genes{$g} = $gene; + $gsi{$g} = 1; + + # fetch associated transcript and translation stable IDs from the old db + foreach my $transcript (@{ $gene->get_all_Transcripts }) { + $tsi{$transcript->stable_id} = 1; + $tlsi{$transcript->translation->stable_id} = 1; + } } -} -my $gsi_string = "'".join("', '", keys(%gsi))."'"; -my $tsi_string = "'".join("', '", keys(%tsi))."'"; -my $tlsi_string = "'".join("', '", keys(%tlsi))."'"; + # + # read list of deleted transcript_stable_ids from file + # + if ($support->param('transcript_stable_id_file')) { + + $support->log_stamped("Reading list of deleted transcript_stable_ids from file, and fetching associated translation stable IDs from the db...\n"); + + my $tfh = $support->filehandle('<', $support->param('transcript_stable_id_file')); + + while (my $t = <$tfh>) { + chomp $t; + my $transcript = $ta->fetch_by_stable_id($t); + + # skip non-protein-coding genes + unless ($transcript->biotype eq 'protein_coding') { + $support->log_warning("Transcript ".$transcript->stable_id." is non-protein_coding, skipping.\n", 1); + next; + } + + $tsi{$transcript->stable_id} = 1; + $tlsi{$transcript->translation->stable_id} = 1; + + my $gene = $ga->fetch_by_transcript_id($transcript->dbID); + $genes{$gene->stable_id} = $gene; + } + } + + $gsi_string = "'".join("', '", keys(%gsi))."'"; + $tsi_string = "'".join("', '", keys(%tsi))."'"; + $tlsi_string = "'".join("', '", keys(%tlsi))."'"; -$support->log_stamped("Done loading ".scalar(keys(%gsi))." gene, ".scalar(keys(%tsi))." transcript and ".scalar(keys(%tlsi))." translation stable IDs.\n\n"); + $support->log_stamped("Done loading ".scalar(keys(%gsi))." gene, ".scalar(keys(%tsi))." transcript and ".scalar(keys(%tlsi))." translation stable IDs.\n\n"); -# exit now if doing a dry run -if ($support->param('dry_run')) { - $support->log("Nothing else to do for a dry run. Exiting.\n\n"); - $support->finish_log; - exit; } -# create a new mapping session -$support->log("Creating new mapping session...\n"); -my $old_db_name = $support->param('altdbname'); -my $new_db_name = $support->param('dbname'); -$sql = qq( - INSERT INTO mapping_session (old_db_name, new_db_name, created) - VALUES ('$old_db_name', '$new_db_name', NOW()) -); -$c = $dbh_new->do($sql); -my $mapping_session_id = $dbh_new->{'mysql_insertid'}; -$support->log("Done.\n\n"); -# -# create stable_id_event entries for all objects, mapping to themselves -# -$support->log_stamped("Creating stable_id_event entries for all objects, mapping to themselves...\n"); -my $msi = $support->param('mapping_session_id'); -$sql = qq( - SELECT new_stable_id, new_version, mapping_session_id, type - FROM stable_id_event - WHERE mapping_session_id = $msi - AND new_stable_id IS NOT NULL -); -$sth = $dbh_new->prepare($sql); -$sth->execute; +sub create_mapping_session { -$c = 0; -my %unique_sie; + # create a new mapping session + $support->log("Creating new mapping session...\n"); + my $old_db_name = $support->param('altdbname'); + my $new_db_name = $support->param('dbname'); + my $sql = qq( + INSERT INTO mapping_session (old_db_name, new_db_name, created) + VALUES ('$old_db_name', '$new_db_name', NOW()) + ); + my $c = $dbh_new->do($sql) unless ($support->param('dry_run')); + my $mapping_session_id = $dbh_new->{'mysql_insertid'}; + $support->log("Done.\n\n"); -while (my $r = $sth->fetchrow_hashref) { - $unique_sie{$r->{'new_stable_id'}.":".$r->{'new_version'}} = $r; - $c++; + return $mapping_session_id; } -$sth->finish; -$sql = qq( - INSERT INTO stable_id_event - VALUES (?, ?, ?, ?, ?, ?, ?) -); -$sth = $dbh_new->prepare($sql); - -# optionally skip ncRNAs -my %nc_genes = (); -if ($support->param('skip_ncrna')) { - foreach my $biotype (qw(miRNA misc_RNA Mt-tRNA Mt-rRNA rRNA snoRNA snRNA)) { - map { $nc_genes{$_->stable_id} = 1 } - @{ $ga->fetch_all_by_biotype($biotype) }; - } -} +sub create_stable_id_events { + + # + # create stable_id_event entries for all objects, mapping to themselves + # + $support->log_stamped("Creating stable_id_event entries for all objects, mapping to themselves...\n"); + + my $ga = $dba_old->get_GeneAdaptor; + my @genes = @{ $ga->fetch_all }; + + my $sql = qq( + INSERT INTO stable_id_event + VALUES (?, ?, ?, ?, ?, ?, ?) + ); + my $sth = $dbh_new->prepare($sql); -foreach my $k (keys %unique_sie) { # optionally skip ncRNAs - next if ($nc_genes{$unique_sie{$k}->{'new_stable_id'}}); - - $sth->execute( - $unique_sie{$k}->{'new_stable_id'}, - $unique_sie{$k}->{'new_version'}, - $unique_sie{$k}->{'new_stable_id'}, - $unique_sie{$k}->{'new_version'}, - $mapping_session_id, - $unique_sie{$k}->{'type'}, - 1 + # + # this is the complete list of ncRNA biotype; you might need to update it (the + # code below will try to help you with this) + my @nc_biotypes = qw( + miRNA + miRNA_pseudogene + misc_RNA + misc_RNA_pseudogene + Mt_rRNA + Mt_tRNA + Mt_tRNA_pseudogene + rRNA + rRNA_pseudogene + scRNA + scRNA_pseudogene + snoRNA + snoRNA_pseudogene + snRNA + snRNA_pseudogene + tRNA_pseudogene ); -} -$sth->finish; + my %skip_biotypes = (); -my $u = scalar(keys(%unique_sie)); -$support->log_stamped("Done inserting $u entries (ignoring ".($c-$u)." duplicates).\n\n"); + if ($support->param('skip_ncrna')) { -# set stable_id_event.new_stable_id to NULL for deleted objects -$support->log_stamped("Setting new_stable_id to NULL for deleted objects...\n"); - -$support->log("Genes... ", 1); -$sql = qq( - UPDATE stable_id_event - SET new_stable_id = NULL, new_version = 0 - WHERE new_stable_id IN ($gsi_string) - AND mapping_session_id = $mapping_session_id -); -$c = $dbh_new->do($sql); -$support->log("[$c]\n"); - -$support->log("Transcripts... ", 1); -$sql = qq( - UPDATE stable_id_event - SET new_stable_id = NULL, new_version = 0 - WHERE new_stable_id IN ($tsi_string) - AND mapping_session_id = $mapping_session_id -); -$c = $dbh_new->do($sql); -$support->log("[$c]\n"); - -$support->log("Translations... ", 1); -$sql = qq( - UPDATE stable_id_event - SET new_stable_id = NULL, new_version = 0 - WHERE new_stable_id IN ($tlsi_string) - AND mapping_session_id = $mapping_session_id -); -$c = $dbh_new->do($sql); -$support->log("[$c]\n"); + %skip_biotypes = map { $_ => 1 } @nc_biotypes; + + # make sure we have a complete list of ncRNA biotypes + $sql = qq(SELECT DISTINCT biotype from gene); + my $sth1 = $dbh_new->prepare($sql); + $sth1->execute; + my @biotypes_db; + + while ((my $biotype) = $sth1->fetchrow_array) { + push @biotypes_db, $biotype unless ($skip_biotypes{$biotype}); + } -$support->log_stamped("Done.\n\n"); + $sth1->finish; -# populate gene_archive and peptide_archive -$support->log_stamped("Populating gene_archive and peptide_archive...\n"); + if (@biotypes_db) { + print "These are the non-ncRNA biotypes found in the db:\n"; + map { print " $_\n" } @biotypes_db; + print "\nPlease check that the list of ncRNA biotypes is still complete, otherwise adapt the script.\n"; + exit unless $support->user_proceed("Continue?"); + } + } -my $sth_gene = $dbh_new->prepare(qq( - INSERT INTO gene_archive - VALUES (?, ?, ?, ?, ?, ?, ?, ?) -)); -my $sth_pep = $dbh_new->prepare(qq( - INSERT INTO peptide_archive (md5_checksum, peptide_seq) - VALUES (?, ?) -)); + # optionally skip other biotypes + if ($support->param('skip_biotypes')) { + %skip_biotypes = map { $_ => 1 } $support->param('skip_biotypes'); + } -$c = 0; + my %stats = map { $_ => 0 } qw(g tr tl g_tot tr_tot); + my $num_genes = scalar(@genes); + my $i; -foreach my $gsi (keys(%genes)) { - my $gene = $genes{$gsi}; + while (my $gene = shift(@genes)) { - foreach my $trans (@{ $gene->get_all_Transcripts }) { - - # skip transcripts that were not deleted (since %genes may contain genes - # were only some but not all transcripts were deleted) - next unless ($tsi{$trans->stable_id}); + $support->log_progress($num_genes, ++$i); - my $tl = $trans->translation; - - # add peptide_archive entry - $sth_pep->execute(md5_hex($trans->translate->seq), $trans->translate->seq); - my $pid = $dbh_new->{'mysql_insertid'}; + $stats{g_tot}++; + + next if ($skip_biotypes{$gene->biotype}); - # add gene_archive entry - $sth_gene->execute( + unless ($support->param('dry_run')) { + $sth->execute( $gene->stable_id, $gene->version, - $trans->stable_id, - $trans->version, - $tl->stable_id, - $tl->version, - $pid, - $mapping_session_id - ); - - $c++; + $gene->stable_id, + $gene->version, + $mapping_session_id, + 'gene', + 1 + ); + } + + $stats{g}++; + + # transcripts + my @transcripts = @{ $gene->get_all_Transcripts }; + while (my $tr = shift(@transcripts)) { + + $stats{tr_tot}++; + + next if ($skip_biotypes{$tr->biotype}); + + unless ($support->param('dry_run')) { + $sth->execute( + $tr->stable_id, + $tr->version, + $tr->stable_id, + $tr->version, + $mapping_session_id, + 'transcript', + 1 + ); + } + + $stats{tr}++; + + # translations + if (my $tl = $tr->translation) { + + unless ($support->param('dry_run')) { + $sth->execute( + $tl->stable_id, + $tl->version, + $tl->stable_id, + $tl->version, + $mapping_session_id, + 'translation', + 1 + ); + } + + $stats{tl}++; + } + + } + } + + $sth->finish; + + $support->log_stamped("Done inserting entries for $stats{g} (of $stats{g_tot}) genes, $stats{tr} (of $stats{tr_tot}) transcripts, $stats{tl} translations.\n\n"); + } -$support->log_stamped("Done adding $c entries.\n\n"); -# finish logfile -$support->finish_log; +sub mark_deleted { + + # set stable_id_event.new_stable_id to NULL for deleted objects + $support->log_stamped("Setting new_stable_id to NULL for deleted objects...\n"); + + $support->log("Genes... ", 1); + my $sql = qq( + UPDATE stable_id_event + SET new_stable_id = NULL, new_version = 0 + WHERE new_stable_id IN ($gsi_string) + AND mapping_session_id = $mapping_session_id + ); + my $c = $dbh_new->do($sql) unless ($support->param('dry_run')); + $support->log("[$c]\n"); + + $support->log("Transcripts... ", 1); + $sql = qq( + UPDATE stable_id_event + SET new_stable_id = NULL, new_version = 0 + WHERE new_stable_id IN ($tsi_string) + AND mapping_session_id = $mapping_session_id + ); + $c = $dbh_new->do($sql) unless ($support->param('dry_run')); + $support->log("[$c]\n"); + + $support->log("Translations... ", 1); + $sql = qq( + UPDATE stable_id_event + SET new_stable_id = NULL, new_version = 0 + WHERE new_stable_id IN ($tlsi_string) + AND mapping_session_id = $mapping_session_id + ); + $c = $dbh_new->do($sql) unless ($support->param('dry_run')); + $support->log("[$c]\n"); + + $support->log_stamped("Done.\n\n"); + +} + + +sub populated_archive { + + # populate gene_archive and peptide_archive + $support->log_stamped("Populating gene_archive and peptide_archive...\n"); + + my $sth_gene = $dbh_new->prepare(qq( + INSERT INTO gene_archive + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + )); + my $sth_pep = $dbh_new->prepare(qq( + INSERT INTO peptide_archive (md5_checksum, peptide_seq) + VALUES (?, ?) + )); + + my $c = 0; + + foreach my $gsi (keys(%genes)) { + my $gene = $genes{$gsi}; + + foreach my $trans (@{ $gene->get_all_Transcripts }) { + + # skip transcripts that were not deleted (since %genes may contain genes + # where only some but not all transcripts were deleted) + next unless ($tsi{$trans->stable_id}); + + my $tl = $trans->translation; + + # add peptide_archive entry + unless ($support->param('dry_run')) { + $sth_pep->execute(md5_hex($trans->translate->seq), $trans->translate->seq); + my $pid = $dbh_new->{'mysql_insertid'}; + + # add gene_archive entry + $sth_gene->execute( + $gene->stable_id, + $gene->version, + $trans->stable_id, + $trans->version, + $tl->stable_id, + $tl->version, + $pid, + $mapping_session_id + ); + } + + $c++; + } + } + $support->log_stamped("Done adding $c entries.\n\n"); + +}