From e40c7ca02bd664f918e2df3b2fba458396afd836 Mon Sep 17 00:00:00 2001 From: Arne Stabenau <stabenau@sanger.ac.uk> Date: Wed, 8 Dec 2004 10:34:48 +0000 Subject: [PATCH] essentially rewritten, should be useless soon thanks to Vivek and Craig --- sql/transfer_misc_affy.pl | 194 ++++++++++++++++++++++++++------------ 1 file changed, 132 insertions(+), 62 deletions(-) diff --git a/sql/transfer_misc_affy.pl b/sql/transfer_misc_affy.pl index e4f515acaa..fa6a489d86 100644 --- a/sql/transfer_misc_affy.pl +++ b/sql/transfer_misc_affy.pl @@ -10,13 +10,21 @@ my $affy_array; #hash containing the relation between misc_set_id => affy_array_ my $probe_feature; #hash containing the information relevant to the affy_feature table my $affy_probe; #hash containing the information relevant to the affy_probe table: misc_set_id -> [probeset,probename] my $affy_probe_id = 1; #first affy_probe_id in the affy_probe table + +# unbeknown to me there are actually probesets that contain the same probe twice. +# for the conversion we have to take them out, it will work better when the features +# and probes are generated from the fasta files directly + +my %kill_probeset = ( '892_at' => 1 ); + + { my ($chost, $cuser, $cpass, $cport, $cdbname); #ensembl core db - GetOptions('chost=s' => \$chost, - 'cuser=s' => \$cuser, - 'cpass=s' => \$cpass, - 'cport=i' => \$cport, - 'cdbname=s' => \$cdbname, + GetOptions('host=s' => \$chost, + 'user=s' => \$cuser, + 'pass=s' => \$cpass, + 'port=i' => \$cport, + 'dbname=s' => \$cdbname, 'tmpdir=s' => \$tmp_dir ); #by default, connect to the stagging server at ecs2:3364 @@ -62,68 +70,130 @@ sub populate_affy_array{ } sub populate_probe_info{ - my $dbCore = shift; + my $dbCore = shift; + + my ($seq_region_id, $seq_region_start, $seq_region_end, $seq_region_strand, + $misc_set_id, $complete_probename, $mismatch); + + my ($affy_name, $affy_probeset, $probename); - my ($seq_region_id, $seq_region_start, $seq_region_end, $seq_region_strand, $misc_set_id, $value, 
$mismatch); - my ($affy_name, $affy_probeset, $probename); #attributes in the value field for an attrib_type_id = 5 - my $previous_seq_region_id = 0; - my $previous_seq_region_start = 0; - my $old_affy_probe; - my $probe_set; - print STDERR "Going to get affy information....\n"; - my $sth = $dbCore->dbc()->prepare(qq{SELECT STRAIGHT_JOIN seq_region_id, seq_region_start, seq_region_end, seq_region_strand, mff.misc_set_id, ma1.value, (IF (ma2.value = 'Mismatch',1,0)) as mismatch - FROM misc_feature mf, misc_attrib ma1, misc_attrib ma2, misc_feature_misc_set mff, attrib_type at1, attrib_type at2, misc_set ms - WHERE ma2.misc_feature_id = mf.misc_feature_id - AND ma2.attrib_type_id = at2.attrib_type_id - AND at2.code = 'matchStatus' - AND ma1.attrib_type_id = at1.attrib_type_id - AND at1.code = 'probeName' - AND ma1.misc_feature_id = mf.misc_feature_id - AND mf.misc_feature_id = mff.misc_feature_id - AND ms.misc_set_id = mff.misc_set_id - AND ms.code <> 'All_Affy' - ORDER BY seq_region_id,seq_region_start - }); + my $previous_seq_region_id = -1; + my $previous_seq_region_start = -1; + my $old_affy_probe; + my $probe_set; + print STDERR "Going to get affy information....\n"; + + + my $sql = + qq{ + SELECT STRAIGHT_JOIN seq_region_id, seq_region_start, seq_region_end, + seq_region_strand, mff.misc_set_id, ma1.value, + (IF (ma2.value = 'Mismatch',1,0)) as mismatch + FROM misc_feature mf, misc_attrib ma1, misc_attrib ma2, + misc_feature_misc_set mff, attrib_type at1, + attrib_type at2, misc_set ms + WHERE ma2.misc_feature_id = mf.misc_feature_id + AND ma2.attrib_type_id = at2.attrib_type_id + AND at2.code = 'matchStatus' + AND ma1.attrib_type_id = at1.attrib_type_id + AND at1.code = 'probeName' + AND ma1.misc_feature_id = mf.misc_feature_id + AND mf.misc_feature_id = mff.misc_feature_id + AND ms.misc_set_id = mff.misc_set_id + AND ms.code <> 'All_Affy' + ORDER BY seq_region_id,seq_region_start }; + + + print STDERR "Ready to create affy files\n"; + open FEATURE, 
">$tmp_dir/affy_feature_$$\.txt"; + open PROBE, ">$tmp_dir/affy_probe_$$\.txt"; + + my %stored_probes; + my $current_probe_id = 1; + my %merge_cache; + my $merge_key; + + # merging of probes is only allowes for mismatch = 0 + for my $mismatch_process ( 0..1 ) { + my $sth = $dbCore->dbc()->prepare( $sql ); $sth->{mysql_use_result} = 1; $sth->execute(); - $sth->bind_columns(\$seq_region_id, \$seq_region_start, \$seq_region_end, \$seq_region_strand, \$misc_set_id, \$value, \$mismatch); - print STDERR "Ready to create affy files\n"; - open FEATURE, ">$tmp_dir/affy_feature_$$\.txt"; - open PROBE, ">$tmp_dir/affy_probe_$$\.txt"; + $sth->bind_columns(\$seq_region_id, \$seq_region_start, \$seq_region_end, + \$seq_region_strand, + \$misc_set_id, \$complete_probename, \$mismatch); + + + my $prev_seq_region_id = -1; + my $prev_start = -1; + while($sth->fetch()){ - #we have a new probe, add the previous one to the database, and flush the structures - unless ((($previous_seq_region_id == $seq_region_id) or ($previous_seq_region_id == 0)) and (($previous_seq_region_start == $seq_region_start) or ($previous_seq_region_start == 0))){ - foreach my $key (keys %{$affy_probe}){ - if (!exists $probe_set->{$affy_probeset . ":" . $affy_probe->{$key}}){ - $probe_set->{$affy_probeset . ":" . $affy_probe->{$key}} = $affy_probe_id; - print PROBE join ("\t",$affy_probe_id,$affy_array->{$key},$affy_probeset,$affy_probe->{$key}),"\n"; - } - $old_affy_probe = $probe_set->{$affy_probeset . ":" . 
$affy_probe->{$key}}; - } - #insert all the affy_probe values in the file - print FEATURE join ("\t",$probe_feature->{'seq_region_id'},$probe_feature->{'seq_region_start'},$probe_feature->{'seq_region_end'}, $probe_feature->{'seq_region_strand'},$probe_feature->{'mismatches'},$old_affy_probe),"\n"; - $affy_probeset = ''; - $affy_probe_id++; - $affy_probe = (); - $probe_feature = (); + + next unless( $mismatch == $mismatch_process ); + + # flush the merge cache regularly + if( $prev_start != $seq_region_start || + $prev_seq_region_id != $seq_region_id ) { + %merge_cache = (); + $prev_start = $seq_region_start; + $prev_seq_region_id = $seq_region_id; + } + + my ($affy_name,$affy_probeset,$probename) = split /:/,$complete_probename,3; + if( $kill_probeset{ $affy_probeset } ) { next; } + + # first check whether we have to store probe information + my $probe_id = $stored_probes{ $complete_probename }; + if( ! $mismatch ) { + $merge_key = join( "-", $seq_region_id, $seq_region_start, $seq_region_end, + $seq_region_strand, $mismatch, $affy_probeset ); + } + + if( ! defined $probe_id ) { + # probe information needs to be stored, but new probe_id or existing one? + $probe_id = $merge_cache{ $merge_key }; + if(( ! 
defined $probe_id ) || $mismatch ) { + $probe_id = $current_probe_id++; } - $previous_seq_region_id = $seq_region_id; - $previous_seq_region_start = $seq_region_start; - ($affy_name,$affy_probeset,$probename) = split /:/,$value,3; - $affy_probe->{$misc_set_id} = $probename; - $probe_feature->{'seq_region_id'} = $seq_region_id; - $probe_feature->{'seq_region_start'} = $seq_region_start; - $probe_feature->{'seq_region_end'} = $seq_region_end; - $probe_feature->{'seq_region_strand'} = $seq_region_strand; - $probe_feature->{'mismatches'} = $mismatch; - } + + print PROBE join( "\t", $probe_id, + $affy_array->{$misc_set_id}, + $affy_probeset, + $probename),"\n"; + $stored_probes{ $complete_probename } = $probe_id; + } + # at this point the probe_id is correct, it might already be clear that the + # feature doesn't need storing (there is already a merge cache entry for + # this position). + + + # do we want to store the feature ? + # if it's already stored with that probe_id it's in the + # merge_cache, no additional feature is needed + if( $mismatch ) { + $merge_key = join( "-", $probe_id, $seq_region_id, $seq_region_start, $seq_region_end, + $seq_region_strand ); + } + + if( exists $merge_cache{ $merge_key } ) { + # this one is already stored + } else { + $merge_cache{ $merge_key } = $probe_id; + print FEATURE join ("\t",$seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, + $mismatch, $probe_id ),"\n"; + } + } + $sth->finish(); - close FEATURE; - close PROBE; -#and finally import the information - print STDERR "Loading new affy information\n"; - load($dbCore,"$tmp_dir/affy_feature_$$\.txt",qw(affy_feature seq_region_id seq_region_start seq_region_end seq_region_strand mismatches affy_probe_id)); - load($dbCore,"$tmp_dir/affy_probe_$$\.txt",qw(affy_probe affy_probe_id affy_array_id probeset name)); + } + + close FEATURE; + close PROBE; + + #and finally import the information + print STDERR "Loading new affy information\n"; + 
load($dbCore,"$tmp_dir/affy_feature_$$\.txt",qw(affy_feature seq_region_id seq_region_start seq_region_end seq_region_strand mismatches affy_probe_id)); + load($dbCore,"$tmp_dir/affy_probe_$$\.txt",qw(affy_probe affy_probe_id affy_array_id probeset name)); } @@ -135,7 +205,7 @@ sub load{ my $cols = join( ",", @colnames ); my $sql = qq{ - LOAD DATA LOCAL INFILE '$file' + LOAD DATA INFILE '$file' INTO TABLE $tablename ($cols) }; $dbCore->dbc()->do($sql); -- GitLab