From e40c7ca02bd664f918e2df3b2fba458396afd836 Mon Sep 17 00:00:00 2001
From: Arne Stabenau <stabenau@sanger.ac.uk>
Date: Wed, 8 Dec 2004 10:34:48 +0000
Subject: [PATCH] essentially rewritten, should be useless soon thanks to Vivek
 and Craig

---
 sql/transfer_misc_affy.pl | 194 ++++++++++++++++++++++++++------------
 1 file changed, 132 insertions(+), 62 deletions(-)

diff --git a/sql/transfer_misc_affy.pl b/sql/transfer_misc_affy.pl
index e4f515acaa..fa6a489d86 100644
--- a/sql/transfer_misc_affy.pl
+++ b/sql/transfer_misc_affy.pl
@@ -10,13 +10,21 @@ my $affy_array; #hash containing the relation between misc_set_id => affy_array_
 my $probe_feature; #hash containing the information relevant to the affy_feature table
 my $affy_probe; #hash containing the information relevant to the affy_probe table: misc_set_id -> [probeset,probename]
 my $affy_probe_id = 1; #first affy_probe_id in the affy_probe table
+
+# Unbeknown to me, there are actually probesets that contain the same probe twice.
+# For the conversion we have to take them out; it will work better when the features
+# and probes are generated from the fasta files directly.
+
+my %kill_probeset = ( '892_at' => 1 );
+
+
 {
     my ($chost, $cuser, $cpass, $cport, $cdbname); #ensembl core db
-    GetOptions('chost=s'   => \$chost,
-	       'cuser=s'   => \$cuser,
-	       'cpass=s'   => \$cpass,
-	       'cport=i'   => \$cport,
-	       'cdbname=s' => \$cdbname,
+    GetOptions('host=s'   => \$chost,
+	       'user=s'   => \$cuser,
+	       'pass=s'   => \$cpass,
+	       'port=i'   => \$cport,
+	       'dbname=s' => \$cdbname,
 	       'tmpdir=s' => \$tmp_dir
 	       );
     #by default, connect to the stagging server at ecs2:3364
@@ -62,68 +70,130 @@ sub populate_affy_array{
 }
 
 sub populate_probe_info{
-    my $dbCore = shift;
+  my $dbCore = shift;
+
+  my ($seq_region_id, $seq_region_start, $seq_region_end, $seq_region_strand, 
+      $misc_set_id, $complete_probename, $mismatch);
+
+  my ($affy_name, $affy_probeset, $probename);
 
-    my ($seq_region_id, $seq_region_start, $seq_region_end, $seq_region_strand, $misc_set_id, $value, $mismatch);
-    my ($affy_name, $affy_probeset, $probename); #attributes in the value field for an attrib_type_id = 5
-    my $previous_seq_region_id = 0;
-    my $previous_seq_region_start = 0;
-    my $old_affy_probe;
-    my $probe_set;
-    print STDERR "Going to get affy information....\n";
-    my $sth = $dbCore->dbc()->prepare(qq{SELECT STRAIGHT_JOIN seq_region_id, seq_region_start, seq_region_end, seq_region_strand, mff.misc_set_id, ma1.value, (IF (ma2.value = 'Mismatch',1,0)) as mismatch
-				      FROM misc_feature mf, misc_attrib ma1, misc_attrib ma2, misc_feature_misc_set mff, attrib_type at1, attrib_type at2, misc_set ms
-				      WHERE ma2.misc_feature_id = mf.misc_feature_id 
-				      AND ma2.attrib_type_id = at2.attrib_type_id 
-				      AND at2.code = 'matchStatus' 
-				      AND ma1.attrib_type_id = at1.attrib_type_id 
-				      AND at1.code = 'probeName'
-				      AND ma1.misc_feature_id = mf.misc_feature_id 
-				      AND mf.misc_feature_id = mff.misc_feature_id 
-				      AND ms.misc_set_id = mff.misc_set_id 
-				      AND ms.code <> 'All_Affy'
-				      ORDER BY seq_region_id,seq_region_start
-				  });
+  my $previous_seq_region_id = -1;
+  my $previous_seq_region_start = -1;
+  my $old_affy_probe;
+  my $probe_set;
+  print STDERR "Going to get affy information....\n";
+
+
+  my $sql = 
+    qq{
+	SELECT STRAIGHT_JOIN seq_region_id, seq_region_start, seq_region_end, 
+	                     seq_region_strand, mff.misc_set_id, ma1.value, 
+                             (IF (ma2.value = 'Mismatch',1,0)) as mismatch
+			  FROM misc_feature mf, misc_attrib ma1, misc_attrib ma2, 
+                               misc_feature_misc_set mff, attrib_type at1, 
+                               attrib_type at2, misc_set ms
+			 WHERE ma2.misc_feature_id = mf.misc_feature_id 
+			   AND ma2.attrib_type_id = at2.attrib_type_id 
+			   AND at2.code = 'matchStatus' 
+			   AND ma1.attrib_type_id = at1.attrib_type_id 
+			   AND at1.code = 'probeName'
+			   AND ma1.misc_feature_id = mf.misc_feature_id 
+			   AND mf.misc_feature_id = mff.misc_feature_id 
+	                   AND ms.misc_set_id = mff.misc_set_id 
+	                   AND ms.code <> 'All_Affy'
+                      ORDER BY seq_region_id,seq_region_start };
+
+
+  print STDERR "Ready to create affy files\n";
+  open FEATURE, ">$tmp_dir/affy_feature_$$\.txt";
+  open PROBE, ">$tmp_dir/affy_probe_$$\.txt";
+
+  my %stored_probes;
+  my $current_probe_id = 1;
+  my %merge_cache;
+  my $merge_key;
+
+  # merging of probes is only allowed for mismatch = 0
+  for my $mismatch_process ( 0..1 ) {
+    my $sth = $dbCore->dbc()->prepare( $sql );
     $sth->{mysql_use_result} = 1;
     $sth->execute();
-    $sth->bind_columns(\$seq_region_id, \$seq_region_start, \$seq_region_end, \$seq_region_strand, \$misc_set_id, \$value, \$mismatch);
-    print STDERR "Ready to create affy files\n";
-    open FEATURE, ">$tmp_dir/affy_feature_$$\.txt";
-    open PROBE, ">$tmp_dir/affy_probe_$$\.txt";
+    $sth->bind_columns(\$seq_region_id, \$seq_region_start, \$seq_region_end, 
+		       \$seq_region_strand, 
+		       \$misc_set_id, \$complete_probename, \$mismatch);
+  
+  
+    my $prev_seq_region_id = -1;
+    my $prev_start = -1;
+
     while($sth->fetch()){
-	#we have a new probe, add the previous one to the database, and flush the structures
-	unless ((($previous_seq_region_id == $seq_region_id) or ($previous_seq_region_id == 0)) and (($previous_seq_region_start == $seq_region_start) or ($previous_seq_region_start == 0))){
-	    foreach my $key (keys %{$affy_probe}){
-		if (!exists $probe_set->{$affy_probeset . ":" . $affy_probe->{$key}}){
-		    $probe_set->{$affy_probeset . ":" . $affy_probe->{$key}} = $affy_probe_id;
-		    print PROBE join ("\t",$affy_probe_id,$affy_array->{$key},$affy_probeset,$affy_probe->{$key}),"\n";
-		}
-		$old_affy_probe = $probe_set->{$affy_probeset . ":" . $affy_probe->{$key}};
-	    }
-	    #insert all the affy_probe values in the file	    
-	    print FEATURE join ("\t",$probe_feature->{'seq_region_id'},$probe_feature->{'seq_region_start'},$probe_feature->{'seq_region_end'}, $probe_feature->{'seq_region_strand'},$probe_feature->{'mismatches'},$old_affy_probe),"\n";
-	    $affy_probeset = '';
-	    $affy_probe_id++;
-	    $affy_probe = ();
-	    $probe_feature = ();
+
+      next unless( $mismatch == $mismatch_process );
+
+      # flush the merge cache regularly
+      if( $prev_start != $seq_region_start ||
+	  $prev_seq_region_id != $seq_region_id ) {
+	%merge_cache = ();
+	$prev_start = $seq_region_start;
+	$prev_seq_region_id = $seq_region_id;
+      }
+
+      my ($affy_name,$affy_probeset,$probename) = split /:/,$complete_probename,3;
+      if( $kill_probeset{ $affy_probeset } ) { next; }
+
+      # first check whether we have to store probe information
+      my $probe_id = $stored_probes{ $complete_probename };
+      if( ! $mismatch ) {
+	$merge_key = join( "-", $seq_region_id, $seq_region_start, $seq_region_end,
+			   $seq_region_strand, $mismatch, $affy_probeset ); 
+      }
+
+      if( ! defined $probe_id ) {
+	# probe information needs to be stored, but new probe_id or existing one?
+	$probe_id = $merge_cache{ $merge_key };
+	if(( ! defined $probe_id ) || $mismatch ) {
+	  $probe_id = $current_probe_id++;
 	}
-	$previous_seq_region_id = $seq_region_id;
-	$previous_seq_region_start = $seq_region_start;
-	($affy_name,$affy_probeset,$probename) = split /:/,$value,3;
-	$affy_probe->{$misc_set_id} = $probename;
-	$probe_feature->{'seq_region_id'} = $seq_region_id;
-	$probe_feature->{'seq_region_start'} = $seq_region_start;
-	$probe_feature->{'seq_region_end'} = $seq_region_end;
-	$probe_feature->{'seq_region_strand'} = $seq_region_strand;
-	$probe_feature->{'mismatches'} = $mismatch;
-    }
+
+	print PROBE join( "\t", $probe_id, 
+			  $affy_array->{$misc_set_id},
+			  $affy_probeset,
+			  $probename),"\n";
+	$stored_probes{ $complete_probename } = $probe_id;
+      }
+      # at this point the probe_id is correct; it might already be clear that the
+      # feature doesn't need storing (there is already a merge cache entry for
+      # this position).
+
+
+      # do we want to store the feature?
+      # if it's already stored with that probe_id it's in the
+      # merge_cache and no additional feature is needed
+      if( $mismatch ) {
+	$merge_key = join( "-", $probe_id, $seq_region_id, $seq_region_start, $seq_region_end,
+			   $seq_region_strand ); 
+      }
+
+      if( exists $merge_cache{ $merge_key } ) {
+	# this one is already stored
+      } else {
+	$merge_cache{ $merge_key } = $probe_id;
+	print FEATURE join ("\t",$seq_region_id, $seq_region_start,
+			    $seq_region_end, $seq_region_strand,
+			    $mismatch, $probe_id ),"\n";
+      }
+    } 
+
     $sth->finish();
-    close FEATURE;
-    close PROBE;
-#and finally import the information
-    print STDERR "Loading new affy information\n";
-    load($dbCore,"$tmp_dir/affy_feature_$$\.txt",qw(affy_feature seq_region_id seq_region_start seq_region_end seq_region_strand mismatches affy_probe_id));
-    load($dbCore,"$tmp_dir/affy_probe_$$\.txt",qw(affy_probe affy_probe_id affy_array_id probeset name));
+  }
+
+  close FEATURE;
+  close PROBE;
+
+  #and finally import the information
+  print STDERR "Loading new affy information\n";
+  load($dbCore,"$tmp_dir/affy_feature_$$\.txt",qw(affy_feature seq_region_id seq_region_start seq_region_end seq_region_strand mismatches affy_probe_id));
+  load($dbCore,"$tmp_dir/affy_probe_$$\.txt",qw(affy_probe affy_probe_id affy_array_id probeset name));
 }
 
 
@@ -135,7 +205,7 @@ sub load{
     
     my $cols = join( ",", @colnames );
     my $sql = qq{
-	LOAD DATA LOCAL INFILE '$file'
+	LOAD DATA INFILE '$file'
 	    INTO TABLE $tablename ($cols)
 	};
     $dbCore->dbc()->do($sql);
-- 
GitLab