Skip to content
Snippets Groups Projects
Commit d5230ce8 authored by Patrick Meidl's avatar Patrick Meidl
Browse files

added --dry_run, removed --mapping_session_id, other fixes

parent a1b10ed0
No related branches found
No related tags found
No related merge requests found
......@@ -25,7 +25,6 @@ Options:
--altuser=USER use old database username USER
--altpass=PASS use old database passwort PASS
--mapping_session_id|msi=ID latest mapping session
--gene_stable_id_file|gsi_file|gsi=FILE
the path of the file containing a list
of gene stable Ids that were deleted
......@@ -34,6 +33,7 @@ Options:
containing a list of transcript stable
Ids that were deleted
--skip_ncrna|skip_ncRNA|skip_nc=0|1 (optionally) skip ncRNAs
--skip_biotypes=LIST (optionally) skip LISTed biotypes
--logfile, --log=FILE log to FILE (default: *STDOUT)
--logpath=PATH write logfile to PATH (default: .)
......@@ -101,10 +101,10 @@ $support->parse_extra_options(
'altuser=s',
'altpass=s',
'altdbname=s',
'mapping_session_id|msi=n',
'gene_stable_id_file|gsi_file|gsi=s',
'transcript_stable_id_file|tsi_file|tsi=s',
'skip_ncrna|skip_ncRNA|skip_nc=s'
'skip_ncrna|skip_ncRNA|skip_nc=s',
'skip_biotypes=s@'
);
$support->allowed_params(
$support->get_common_params,
......@@ -113,10 +113,10 @@ $support->allowed_params(
'altuser',
'altpass',
'altdbname',
'mapping_session_id',
'gene_stable_id_file',
'transcript_stable_id_file',
'skip_ncrna',
'skip_biotypes',
);
if ($support->param('help') or $support->error) {
......@@ -124,6 +124,8 @@ if ($support->param('help') or $support->error) {
pod2usage(1);
}
$support->comma_to_list('skip_biotypes');
# ask user to confirm parameters to proceed
$support->confirm_params;
......@@ -135,7 +137,6 @@ $support->check_required_params(
'altport',
'altuser',
'altdbname',
'mapping_session_id',
'gene_stable_id_file',
);
......@@ -143,235 +144,357 @@ $support->check_required_params(
my $dba_new = $support->get_database('core');
my $dba_old = $support->get_database('ensembl', 'alt');
my $dbh_new = $dba_new->dbc->db_handle;
my $ta = $dba_old->get_TranscriptAdaptor;
my $ga = $dba_old->get_GeneAdaptor;
my $sql;
my $c;
my $sth;
#
# read list of deleted gene_stable_ids from file
#
$support->log_stamped("Reading list of deleted gene_stable_ids from file, and fetching associated transcript and translation stable IDs from the db...\n");
# define some globally used variables
my %genes;
my %gsi;
my %tsi;
my %tlsi;
my %genes;
my $gfh = $support->filehandle('<', $support->param('gene_stable_id_file'));
my $gsi_string;
my $tsi_string;
my $tlsi_string;
while (my $g = <$gfh>) {
chomp $g;
my $gene = $ga->fetch_by_stable_id($g);
# read list of deleted gene and transcript stable IDs from file(s)
&parse_deleted_files;
# skip non-protein-coding genes
unless ($gene->biotype eq 'protein_coding') {
$support->log_warning("Gene ".$gene->stable_id." is non-protein_coding, skipping.\n", 1);
next;
}
$genes{$g} = $gene;
$gsi{$g} = 1;
# create new mapping session
my $mapping_session_id = &create_mapping_session;
# fetch associated transcript and translation stable IDs from the 37 db
foreach my $transcript (@{ $gene->get_all_Transcripts }) {
$tsi{$transcript->stable_id} = 1;
$tlsi{$transcript->translation->stable_id} = 1;
}
}
# create stable_id_event entries for all objects, mapping to themselves
&create_stable_id_events;
# set stable_id_event.new_stable_id to NULL for deleted objects
&mark_deleted;
#
# read list of deleted transcript_stable_ids from file
#
if ($support->param('transcript_stable_id_file')) {
# populate gene_archive and peptide_archive
&populated_archive;
# finish logfile
$support->finish_log;
### END main ###
$support->log_stamped("Reading list of deleted transcript_stable_ids from file, and fetching associated translation stable IDs from the db...\n");
sub parse_deleted_files {
my $tfh = $support->filehandle('<', $support->param('transcript_stable_id_file'));
my $ta = $dba_old->get_TranscriptAdaptor;
my $ga = $dba_old->get_GeneAdaptor;
while (my $t = <$tfh>) {
chomp $t;
my $transcript = $ta->fetch_by_stable_id($t);
#
# read list of deleted gene_stable_ids from file
#
$support->log_stamped("Reading list of deleted gene_stable_ids from file, and fetching associated transcript and translation stable IDs from the db...\n");
my $gfh = $support->filehandle('<', $support->param('gene_stable_id_file'));
while (my $g = <$gfh>) {
chomp $g;
my $gene = $ga->fetch_by_stable_id($g);
# skip non-protein-coding genes
unless ($transcript->biotype eq 'protein_coding') {
$support->log_warning("Transcript ".$transcript->stable_id." is non-protein_coding, skipping.\n", 1);
unless ($gene->biotype eq 'protein_coding') {
$support->log_warning("Gene ".$gene->stable_id." is non-protein_coding, skipping.\n", 1);
next;
}
$tsi{$transcript->stable_id} = 1;
$tlsi{$transcript->translation->stable_id} = 1;
my $gene = $ga->fetch_by_transcript_id($transcript->dbID);
$genes{$gene->stable_id} = $gene;
$genes{$g} = $gene;
$gsi{$g} = 1;
# fetch associated transcript and translation stable IDs from the old db
foreach my $transcript (@{ $gene->get_all_Transcripts }) {
$tsi{$transcript->stable_id} = 1;
$tlsi{$transcript->translation->stable_id} = 1;
}
}
}
my $gsi_string = "'".join("', '", keys(%gsi))."'";
my $tsi_string = "'".join("', '", keys(%tsi))."'";
my $tlsi_string = "'".join("', '", keys(%tlsi))."'";
#
# read list of deleted transcript_stable_ids from file
#
if ($support->param('transcript_stable_id_file')) {
$support->log_stamped("Reading list of deleted transcript_stable_ids from file, and fetching associated translation stable IDs from the db...\n");
my $tfh = $support->filehandle('<', $support->param('transcript_stable_id_file'));
while (my $t = <$tfh>) {
chomp $t;
my $transcript = $ta->fetch_by_stable_id($t);
# skip non-protein-coding genes
unless ($transcript->biotype eq 'protein_coding') {
$support->log_warning("Transcript ".$transcript->stable_id." is non-protein_coding, skipping.\n", 1);
next;
}
$tsi{$transcript->stable_id} = 1;
$tlsi{$transcript->translation->stable_id} = 1;
my $gene = $ga->fetch_by_transcript_id($transcript->dbID);
$genes{$gene->stable_id} = $gene;
}
}
$gsi_string = "'".join("', '", keys(%gsi))."'";
$tsi_string = "'".join("', '", keys(%tsi))."'";
$tlsi_string = "'".join("', '", keys(%tlsi))."'";
$support->log_stamped("Done loading ".scalar(keys(%gsi))." gene, ".scalar(keys(%tsi))." transcript and ".scalar(keys(%tlsi))." translation stable IDs.\n\n");
$support->log_stamped("Done loading ".scalar(keys(%gsi))." gene, ".scalar(keys(%tsi))." transcript and ".scalar(keys(%tlsi))." translation stable IDs.\n\n");
# exit now if doing a dry run
if ($support->param('dry_run')) {
$support->log("Nothing else to do for a dry run. Exiting.\n\n");
$support->finish_log;
exit;
}
# create a new mapping session
$support->log("Creating new mapping session...\n");
my $old_db_name = $support->param('altdbname');
my $new_db_name = $support->param('dbname');
$sql = qq(
INSERT INTO mapping_session (old_db_name, new_db_name, created)
VALUES ('$old_db_name', '$new_db_name', NOW())
);
$c = $dbh_new->do($sql);
my $mapping_session_id = $dbh_new->{'mysql_insertid'};
$support->log("Done.\n\n");
#
# create stable_id_event entries for all objects, mapping to themselves
#
$support->log_stamped("Creating stable_id_event entries for all objects, mapping to themselves...\n");
my $msi = $support->param('mapping_session_id');
$sql = qq(
SELECT new_stable_id, new_version, mapping_session_id, type
FROM stable_id_event
WHERE mapping_session_id = $msi
AND new_stable_id IS NOT NULL
);
$sth = $dbh_new->prepare($sql);
$sth->execute;
sub create_mapping_session {
$c = 0;
my %unique_sie;
# create a new mapping session
$support->log("Creating new mapping session...\n");
my $old_db_name = $support->param('altdbname');
my $new_db_name = $support->param('dbname');
my $sql = qq(
INSERT INTO mapping_session (old_db_name, new_db_name, created)
VALUES ('$old_db_name', '$new_db_name', NOW())
);
my $c = $dbh_new->do($sql) unless ($support->param('dry_run'));
my $mapping_session_id = $dbh_new->{'mysql_insertid'};
$support->log("Done.\n\n");
while (my $r = $sth->fetchrow_hashref) {
$unique_sie{$r->{'new_stable_id'}.":".$r->{'new_version'}} = $r;
$c++;
return $mapping_session_id;
}
$sth->finish;
$sql = qq(
INSERT INTO stable_id_event
VALUES (?, ?, ?, ?, ?, ?, ?)
);
$sth = $dbh_new->prepare($sql);
# optionally skip ncRNAs
my %nc_genes = ();
if ($support->param('skip_ncrna')) {
foreach my $biotype (qw(miRNA misc_RNA Mt-tRNA Mt-rRNA rRNA snoRNA snRNA)) {
map { $nc_genes{$_->stable_id} = 1 }
@{ $ga->fetch_all_by_biotype($biotype) };
}
}
sub create_stable_id_events {
#
# create stable_id_event entries for all objects, mapping to themselves
#
$support->log_stamped("Creating stable_id_event entries for all objects, mapping to themselves...\n");
my $ga = $dba_old->get_GeneAdaptor;
my @genes = @{ $ga->fetch_all };
my $sql = qq(
INSERT INTO stable_id_event
VALUES (?, ?, ?, ?, ?, ?, ?)
);
my $sth = $dbh_new->prepare($sql);
foreach my $k (keys %unique_sie) {
# optionally skip ncRNAs
next if ($nc_genes{$unique_sie{$k}->{'new_stable_id'}});
$sth->execute(
$unique_sie{$k}->{'new_stable_id'},
$unique_sie{$k}->{'new_version'},
$unique_sie{$k}->{'new_stable_id'},
$unique_sie{$k}->{'new_version'},
$mapping_session_id,
$unique_sie{$k}->{'type'},
1
#
# this is the complete list of ncRNA biotype; you might need to update it (the
# code below will try to help you with this)
my @nc_biotypes = qw(
miRNA
miRNA_pseudogene
misc_RNA
misc_RNA_pseudogene
Mt_rRNA
Mt_tRNA
Mt_tRNA_pseudogene
rRNA
rRNA_pseudogene
scRNA
scRNA_pseudogene
snoRNA
snoRNA_pseudogene
snRNA
snRNA_pseudogene
tRNA_pseudogene
);
}
$sth->finish;
my %skip_biotypes = ();
my $u = scalar(keys(%unique_sie));
$support->log_stamped("Done inserting $u entries (ignoring ".($c-$u)." duplicates).\n\n");
if ($support->param('skip_ncrna')) {
# set stable_id_event.new_stable_id to NULL for deleted objects
$support->log_stamped("Setting new_stable_id to NULL for deleted objects...\n");
$support->log("Genes... ", 1);
$sql = qq(
UPDATE stable_id_event
SET new_stable_id = NULL, new_version = 0
WHERE new_stable_id IN ($gsi_string)
AND mapping_session_id = $mapping_session_id
);
$c = $dbh_new->do($sql);
$support->log("[$c]\n");
$support->log("Transcripts... ", 1);
$sql = qq(
UPDATE stable_id_event
SET new_stable_id = NULL, new_version = 0
WHERE new_stable_id IN ($tsi_string)
AND mapping_session_id = $mapping_session_id
);
$c = $dbh_new->do($sql);
$support->log("[$c]\n");
$support->log("Translations... ", 1);
$sql = qq(
UPDATE stable_id_event
SET new_stable_id = NULL, new_version = 0
WHERE new_stable_id IN ($tlsi_string)
AND mapping_session_id = $mapping_session_id
);
$c = $dbh_new->do($sql);
$support->log("[$c]\n");
%skip_biotypes = map { $_ => 1 } @nc_biotypes;
# make sure we have a complete list of ncRNA biotypes
$sql = qq(SELECT DISTINCT biotype from gene);
my $sth1 = $dbh_new->prepare($sql);
$sth1->execute;
my @biotypes_db;
while ((my $biotype) = $sth1->fetchrow_array) {
push @biotypes_db, $biotype unless ($skip_biotypes{$biotype});
}
$support->log_stamped("Done.\n\n");
$sth1->finish;
# populate gene_archive and peptide_archive
$support->log_stamped("Populating gene_archive and peptide_archive...\n");
if (@biotypes_db) {
print "These are the non-ncRNA biotypes found in the db:\n";
map { print " $_\n" } @biotypes_db;
print "\nPlease check that the list of ncRNA biotypes is still complete, otherwise adapt the script.\n";
exit unless $support->user_proceed("Continue?");
}
}
my $sth_gene = $dbh_new->prepare(qq(
INSERT INTO gene_archive
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
));
my $sth_pep = $dbh_new->prepare(qq(
INSERT INTO peptide_archive (md5_checksum, peptide_seq)
VALUES (?, ?)
));
# optionally skip other biotypes
if ($support->param('skip_biotypes')) {
%skip_biotypes = map { $_ => 1 } $support->param('skip_biotypes');
}
$c = 0;
my %stats = map { $_ => 0 } qw(g tr tl g_tot tr_tot);
my $num_genes = scalar(@genes);
my $i;
foreach my $gsi (keys(%genes)) {
my $gene = $genes{$gsi};
while (my $gene = shift(@genes)) {
foreach my $trans (@{ $gene->get_all_Transcripts }) {
# skip transcripts that were not deleted (since %genes may contain genes
# were only some but not all transcripts were deleted)
next unless ($tsi{$trans->stable_id});
$support->log_progress($num_genes, ++$i);
my $tl = $trans->translation;
# add peptide_archive entry
$sth_pep->execute(md5_hex($trans->translate->seq), $trans->translate->seq);
my $pid = $dbh_new->{'mysql_insertid'};
$stats{g_tot}++;
next if ($skip_biotypes{$gene->biotype});
# add gene_archive entry
$sth_gene->execute(
unless ($support->param('dry_run')) {
$sth->execute(
$gene->stable_id,
$gene->version,
$trans->stable_id,
$trans->version,
$tl->stable_id,
$tl->version,
$pid,
$mapping_session_id
);
$c++;
$gene->stable_id,
$gene->version,
$mapping_session_id,
'gene',
1
);
}
$stats{g}++;
# transcripts
my @transcripts = @{ $gene->get_all_Transcripts };
while (my $tr = shift(@transcripts)) {
$stats{tr_tot}++;
next if ($skip_biotypes{$tr->biotype});
unless ($support->param('dry_run')) {
$sth->execute(
$tr->stable_id,
$tr->version,
$tr->stable_id,
$tr->version,
$mapping_session_id,
'transcript',
1
);
}
$stats{tr}++;
# translations
if (my $tl = $tr->translation) {
unless ($support->param('dry_run')) {
$sth->execute(
$tl->stable_id,
$tl->version,
$tl->stable_id,
$tl->version,
$mapping_session_id,
'translation',
1
);
}
$stats{tl}++;
}
}
}
$sth->finish;
$support->log_stamped("Done inserting entries for $stats{g} (of $stats{g_tot}) genes, $stats{tr} (of $stats{tr_tot}) transcripts, $stats{tl} translations.\n\n");
}
$support->log_stamped("Done adding $c entries.\n\n");
# finish logfile
$support->finish_log;
sub mark_deleted {
# set stable_id_event.new_stable_id to NULL for deleted objects
$support->log_stamped("Setting new_stable_id to NULL for deleted objects...\n");
$support->log("Genes... ", 1);
my $sql = qq(
UPDATE stable_id_event
SET new_stable_id = NULL, new_version = 0
WHERE new_stable_id IN ($gsi_string)
AND mapping_session_id = $mapping_session_id
);
my $c = $dbh_new->do($sql) unless ($support->param('dry_run'));
$support->log("[$c]\n");
$support->log("Transcripts... ", 1);
$sql = qq(
UPDATE stable_id_event
SET new_stable_id = NULL, new_version = 0
WHERE new_stable_id IN ($tsi_string)
AND mapping_session_id = $mapping_session_id
);
$c = $dbh_new->do($sql) unless ($support->param('dry_run'));
$support->log("[$c]\n");
$support->log("Translations... ", 1);
$sql = qq(
UPDATE stable_id_event
SET new_stable_id = NULL, new_version = 0
WHERE new_stable_id IN ($tlsi_string)
AND mapping_session_id = $mapping_session_id
);
$c = $dbh_new->do($sql) unless ($support->param('dry_run'));
$support->log("[$c]\n");
$support->log_stamped("Done.\n\n");
}
sub populated_archive {
# populate gene_archive and peptide_archive
$support->log_stamped("Populating gene_archive and peptide_archive...\n");
my $sth_gene = $dbh_new->prepare(qq(
INSERT INTO gene_archive
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
));
my $sth_pep = $dbh_new->prepare(qq(
INSERT INTO peptide_archive (md5_checksum, peptide_seq)
VALUES (?, ?)
));
my $c = 0;
foreach my $gsi (keys(%genes)) {
my $gene = $genes{$gsi};
foreach my $trans (@{ $gene->get_all_Transcripts }) {
# skip transcripts that were not deleted (since %genes may contain genes
# where only some but not all transcripts were deleted)
next unless ($tsi{$trans->stable_id});
my $tl = $trans->translation;
# add peptide_archive entry
unless ($support->param('dry_run')) {
$sth_pep->execute(md5_hex($trans->translate->seq), $trans->translate->seq);
my $pid = $dbh_new->{'mysql_insertid'};
# add gene_archive entry
$sth_gene->execute(
$gene->stable_id,
$gene->version,
$trans->stable_id,
$trans->version,
$tl->stable_id,
$tl->version,
$pid,
$mapping_session_id
);
}
$c++;
}
}
$support->log_stamped("Done adding $c entries.\n\n");
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment