Skip to content
Snippets Groups Projects
Commit f9fbec4c authored by Nathan Johnson's avatar Nathan Johnson
Browse files

added cache_and_load method to prevent memory errors with unmapped object cache

parent 54e4ea4a
No related branches found
No related tags found
No related merge requests found
......@@ -13,30 +13,32 @@ my ($transcript_host, $transcript_port, $transcript_user, $transcript_pass, $tra
$max_mismatches, $utr_length, $max_transcripts_per_probeset, $max_transcripts, @arrays, $delete,
$mapping_threshold, $no_triage, $health_check);
my $first_cache = 1;
GetOptions('transcript_host=s' => \$transcript_host,
'transcript_user=s' => \$transcript_user,
'transcript_port=i' => \$transcript_port,
'transcript_pass=s' => \$transcript_pass,
'transcript_dbname=s' => \$transcript_dbname,
'oligo_host=s' => \$oligo_host,
'oligo_host=s' => \$oligo_host,
'oligo_user=s' => \$oligo_user,
'oligo_port=i' => \$oligo_port,
'oligo_pass=s' => \$oligo_pass,
'oligo_dbname=s' => \$oligo_dbname,
'xref_host=s' => \$xref_host,
'xref_host=s' => \$xref_host,
'xref_user=s' => \$xref_user,
'xref_port=i' => \$xref_port,
'xref_pass=s' => \$xref_pass,
'xref_dbname=s' => \$xref_dbname,
'mismatches=i' => \$max_mismatches,
'mismatches=i' => \$max_mismatches,
'utr_length=i' => \$utr_length,
'max_probesets=i' => \$max_transcripts_per_probeset,
'max_transcripts=i' => \$max_transcripts,
'threshold=s' => \$mapping_threshold,
'arrays=s' => \@arrays,
'delete' => \$delete,
'no_triage' => \$no_triage,
'health_check' => \$health_check,
'max_probesets=i' => \$max_transcripts_per_probeset,
'max_transcripts=i' => \$max_transcripts,
'threshold=s' => \$mapping_threshold,
'arrays=s' => \@arrays,
'delete' => \$delete,
'no_triage' => \$no_triage,
'health_check' => \$health_check,
'help' => sub { usage(); exit(0); });
$transcript_port ||= 3306; $oligo_port ||= 3306; $xref_port ||= 3306;
......@@ -117,7 +119,6 @@ my %promiscuous_probesets;
my %transcripts_per_probeset;
my %transcript_ids;
my %transcript_probeset_count; # key: transcript:probeset value: count
my %arrays_per_probeset;
......@@ -137,11 +138,12 @@ my $total = scalar(@transcripts);
open (LOG, ">${transcript_dbname}_probe2transcript.log");
print "Identified ".scalar(@transcripts)." for probe mappinng\n";
print "Mapping, percentage complete: ";
foreach my $transcript (@transcripts) {
my $pc = int ((100 * $i) / $total);
if ($pc > $last_pc) {
......@@ -183,7 +185,6 @@ foreach my $transcript (@transcripts) {
}
foreach my $feature (@$oligo_features) {
#next if ($transcript->strand() != $feature->strand()); # XXX log this
my $probe = $feature->probe();
......@@ -200,27 +201,48 @@ foreach my $transcript (@transcripts) {
if ($exon_overlap >= $min_overlap) {
#print "Exon overlap $exon_overlap excedes minimum overlap $min_overlap\n";
$transcript_probeset_count{$transcript_key}{$probe_name}++;
} elsif ($utr_overlap >= $min_overlap) {
#print "UTR overlap $utr_overlap excedes minimum overlap $min_overlap\n";
$transcript_probeset_count{$transcript_key}{$probe_name}++;
} else { # must be intronic
#print "Intronic\n";
print LOG "Unmapped intronic " . $transcript->stable_id . "\t" . $probeset . " probe length $probe_length\n";
if (!$no_triage) {
push @unmapped_objects, new Bio::EnsEMBL::UnmappedObject(-type => 'probe2transcript',
-analysis => $analysis,
-identifier => $probeset,
-summary => "Unmapped intronic",
-full_desc => "Probe mapped to intronic region of transcript",
-ensembl_object_type => 'Transcript',
-ensembl_id => $transcript->dbID());
}
}
my $um_obj = new Bio::EnsEMBL::UnmappedObject(-type => 'probe2transcript',
-analysis => $analysis,
-identifier => $probeset,
-summary => "Unmapped intronic",
-full_desc => "Probe mapped to intronic region of transcript",
-ensembl_object_type => 'Transcript',
-ensembl_id => $transcript->dbID());
&cache_and_load_unmapped_objects($um_obj);
#push @unmapped_objects, new Bio::EnsEMBL::UnmappedObject(-type => 'probe2transcript',
# -analysis => $analysis,
# -identifier => $probeset,
# -summary => "Unmapped intronic",
# -full_desc => "Probe mapped to intronic region of transcript",
# -ensembl_object_type => 'Transcript',
# -ensembl_id => $transcript->dbID());
}
}
}
# TODO - make external_db array names == array names in OligoArray!
......@@ -278,13 +300,26 @@ foreach my $key (keys %transcript_probeset_count) {
print LOG "$probeset\t$transcript\tinsufficient\t$probeset_size\t$hits\n";
if (!$no_triage) {
push @unmapped_objects, new Bio::EnsEMBL::UnmappedObject(-type => 'probe2transcript',
-analysis => $analysis,
-identifier => $probeset,
-summary => "Insufficient hits",
-full_desc => "Probe had an insufficient number of hits (probeset size = $probeset_size, hits = $hits)",
-ensembl_object_type => 'Transcript',
-ensembl_id => $transcript_ids{$transcript});
my $um_obj = new Bio::EnsEMBL::UnmappedObject(-type => 'probe2transcript',
-analysis => $analysis,
-identifier => $probeset,
-summary => "Insufficient hits",
-full_desc => "Probe had an insufficient number of hits (probeset size = $probeset_size, hits = $hits)",
-ensembl_object_type => 'Transcript',
-ensembl_id => $transcript_ids{$transcript});
#push @unmapped_objects, new Bio::EnsEMBL::UnmappedObject(-type => 'probe2transcript',
# -analysis => $analysis,
# -identifier => $probeset,
# -summary => "Insufficient hits",
# -full_desc => "Probe had an insufficient number of hits (probeset size = $probeset_size, hits = $hits)",
# -ensembl_object_type => 'Transcript',
# -ensembl_id => $transcript_ids{$transcript});
&cache_and_load_unmapped_objects($um_obj);
}
}
......@@ -293,6 +328,10 @@ foreach my $key (keys %transcript_probeset_count) {
}
#can we load first batch of unmapped_object here to save memmory
# Find probesets that don't match any transcripts at all, write to log file
log_orphan_probes();
......@@ -306,6 +345,32 @@ if (!$no_triage) {
}
# ----------------------------------------------------------------------
# only loads unless cache hits size limit
sub cache_and_load_unmapped_objects{
my ($um_obj) = @_;
push @unmapped_objects, $um_obj;
if(scalar(@unmapped_objects) >10000){
#print "Uploading " . scalar(@unmapped_objects) . " unmapped reasons to xref database\n";
if($first_cache){
$unmapped_objects[0]->dbID('2000');
$first_cache = 0;
}
$unmapped_object_adaptor->store(@unmapped_objects);
@unmapped_objects = ();
}
}
# ----------------------------------------------------------------------
sub log_orphan_probes {
......@@ -319,11 +384,19 @@ sub log_orphan_probes {
print LOG "$probeset\tNo transcript mappings\n";
if (!$no_triage && $probeset) {
push @unmapped_objects, new Bio::EnsEMBL::UnmappedObject(-type => 'probe2transcript',
-analysis => $analysis,
-identifier => $probeset,
-summary => "No transcript mappings",
-full_desc => "Probeset did not map to any transcripts");
my $um_obj = new Bio::EnsEMBL::UnmappedObject(-type => 'probe2transcript',
-analysis => $analysis,
-identifier => $probeset,
-summary => "No transcript mappings",
-full_desc => "Probeset did not map to any transcripts");
&cache_and_load_unmapped_objects($um_obj);
#push @unmapped_objects, new Bio::EnsEMBL::UnmappedObject(-type => 'probe2transcript',
# -analysis => $analysis,
# -identifier => $probeset,
# -summary => "No transcript mappings",
# -full_desc => "Probeset did not map to any transcripts");
}
}
}
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment