test_assembly_mapping.pl 9.8 KB
Newer Older
1
#!/usr/bin/env perl
2
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
3
4
5
6
7
8
9
10
11
12
13
14
15
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#      http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

=head1

test_assembly_mapping.pl [arguments]

Produce statistics on the quality of feature mappings between two different assemblies.
Feature mappings need to be loaded first using script load_feature_mappings.pl
Sample config file: test_assembly_mapping.ini.example

Required arguments:
  --host=HOST                 test database host HOST
  --port=PORT                 test database port PORT
  --user=USER                 test database username USER
  --pass=PASS                 test database passwort PASS
  --dbname=NAME               test database name NAME


Optional arguments:

  --conffile=filename     read parameters from FILE
                                        (default: conf/Conversion.ini)
  --logfile, --log=FILE               log to FILE (default: *STDOUT)
  --logpath=PATH                      write logfile to PATH (default: .)
  --logappend, --log_append           append to logfile (default: truncate)

=cut

use strict;
use warnings;
no warnings 'uninitialized';

use FindBin qw($Bin);
use Getopt::Long;
use Pod::Usage;
use Bio::EnsEMBL::Utils::ConversionSupport;
use Bio::EnsEMBL::DBSQL::DBAdaptor;


use vars qw($SERVERROOT);

BEGIN {
    $SERVERROOT = "$Bin/../";
}


$| = 1;

my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);

# parse options
$support->parse_common_options(@_);

$support->allowed_params(qw(dbname host port user pass conffile logfile logpath logappend));

if ($support->param('help') or $support->error) {
    warn $support->error if $support->error;
    pod2usage(1);
}

# ask user to confirm parameters to proceed
$support->confirm_params;

# get log filehandle and print heading and parameters to logfile
$support->init_log;

$support->check_required_params(qw(dbname host port user));

my ($dba, $dbh, $sql, $sth);
$dbh = $support->get_dbconnection();


my %mapping_quality = ( 1 => 'with gaps',
			2 => 'without gaps');

#get chromosomes

my @chrs = @{$dbh->selectcol_arrayref("select distinct old_chr from mapping order by old_chr+0")};

foreach my $chr (@chrs) {
 
  my @feature_types = @{$dbh->selectcol_arrayref("select distinct feature_type from mapping where old_chr = '". $chr . "' order by feature_type")};

  foreach my $feature_type (@feature_types) {
      $support->log_stamped("processing $feature_type mappings for chromosome $chr\n\n",1);


      #count all features in the old db
103
      $sql = "select count(1) from mapping where feature_type = '" . $feature_type . "' and old_chr = '". $chr . "'" ;
104
105
106
      my ($total_features_old) = $dbh->selectrow_array($sql);

      #count all features both in the old and new db
107
      $sql = "select count(1) from mapping where feature_type = '" . $feature_type . "' and feature_found_in_new_db = 1 and old_chr = '". $chr . "'";
108
109
110
111
112
113
114
115
      my ($total_features_old_new) = $dbh->selectrow_array($sql);

      my $perc_features = ($total_features_old_new/$total_features_old) * 100;
      $perc_features = sprintf "%.2f", $perc_features;

      $support->log_stamped("$total_features_old_new $feature_type" . "s found in both old and new assembly dbs ($perc_features\% in old assembly db found in new db)\n\n", 1);

      #count features which have the same length in the new db
116
      $sql = "select count(1) from mapping where feature_type = '" . $feature_type . "' and  feature_found_in_new_db = 1 and old_length = new_length and old_chr = '". $chr . "'";
117
118
119
120
121
122
123
124
125
126
127

      my ($total_same_length) = $dbh->selectrow_array($sql);

      my $ok_mappings_1 = 0;
      my $perc_mappings_1 = 0;

      if ($total_same_length > 0) {
	  #count mappings where start and end match feature location in the new db by mapping quality (1-contains gaps, 2 - no gaps

	  $support->log_stamped("$total_same_length $feature_type" . "s where feature length in the old assembly db is the same as in the new assembly db\n\n", 2);

128
	  $sql = "select count(1) as mapping_count,mapping_quality from mapping where feature_type = '" . $feature_type . "' and old_chr = '". $chr . "' and  feature_found_in_new_db = 1 and old_length = new_length and new_start = mapping_start and new_end = mapping_end and (new_strand = mapping_strands or mapping_strands = 'both') and (mapping_chrs = new_chr or mapping_chrs like concat(new_chr,',%')) group by mapping_quality";
129
130
131
132
133
134
135
136
137

	  my %mapping_counts = %{$dbh->selectall_hashref($sql,"mapping_quality")};
	  
	  my $ok_mappings = 0;

	  foreach my $mapping_q (keys %mapping_counts) {
		   my $perc = ($mapping_counts{$mapping_q}{'mapping_count'}/$total_same_length) * 100;
		   $perc = sprintf "%.2f", $perc;

138
		   $support->log_stamped("$mapping_counts{$mapping_q}{'mapping_count'} $feature_type mappings $mapping_quality{$mapping_q} where feature length in the old assembly db is the same as in the new assembly db and mapping start and end match feature location in the new db ($perc\%)\n\n", 3);
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
		  
		   $ok_mappings += $mapping_counts{$mapping_q}{'mapping_count'};
		   
	  }

	  $ok_mappings_1 += $ok_mappings;
	  my $perc_mappings = ($ok_mappings/$total_same_length) * 100;
	  $perc_mappings = sprintf "%.2f", $perc_mappings;

	  $support->log_stamped("total: $ok_mappings ok $feature_type mappings ($perc_mappings\%):\n\n", 3);

      } else {
	  $support->log_stamped("no $feature_type" . "s found for chromosome $chr where feature length in the old assembly db is the same as in the new assembly db\n\n", 2);
      }
   
154
      $sql = "select count(1) from mapping where feature_type = '" . $feature_type . "' and old_chr = '". $chr . "' and feature_found_in_new_db = 1 and old_length != new_length";
155
156
157
158
159
160
161
162
163

      my ($total_diff_length) = $dbh->selectrow_array($sql);

      if ($total_diff_length > 0) {      

	  $support->log_stamped("$total_diff_length $feature_type" . "s found for chromosome $chr where feature length in the old assembly db is different than in the new assembly db\n\n", 2);

	  #count features with different length in the new db where either mapping start or end match new feature start or end respectively, by mapping quality
	  
164
	  $sql = "select count(1) as mapping_count,mapping_quality from mapping where feature_type = '" . $feature_type . "' and old_chr = '". $chr . "'  and  feature_found_in_new_db = 1 and old_length != new_length and (new_start = mapping_start or new_end = mapping_end) and (new_strand = mapping_strands or mapping_strands = 'both') and (mapping_chrs = new_chr or mapping_chrs like concat(new_chr,',%')) group by mapping_quality";
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180

	  my %mapping_counts = %{$dbh->selectall_hashref($sql,"mapping_quality")};

	  my $ok_mappings = 0;
	 
	  foreach my $mapping_q (keys %mapping_counts) {
	       my $perc = ($mapping_counts{$mapping_q}{'mapping_count'}/$total_diff_length) * 100;
	       $perc = sprintf "%.2f", $perc;

	       $support->log_stamped("$mapping_counts{$mapping_q}{'mapping_count'} $feature_type mappings $mapping_quality{$mapping_q} found for chromosome where feature length in the old assembly db is different than in the new assembly db and either mapping start or end matches the gene location in the new db ($perc\%)\n\n", 3);

	       $ok_mappings += $mapping_counts{$mapping_q}{'mapping_count'};
	       
	  }

	  #count features with different length in the new db where either mapping start or end is within the difference between old and new length from the new feature start or end respectively, by mapping quality
181
	  $sql = "select count(1) as mapping_count,mapping_quality from mapping where feature_type = '" . $feature_type . "' and old_chr = '". $chr . "' and  feature_found_in_new_db = 1 and old_length != new_length and (abs(new_start - mapping_start) <= abs(new_length - old_length) or abs(new_end - mapping_end) <= abs(new_length - old_length)) and new_start != mapping_start and new_end != mapping_end and (new_strand = mapping_strands or mapping_strands = 'both') and (mapping_chrs = new_chr or mapping_chrs like concat(new_chr,',%')) group by mapping_quality";
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215

	  %mapping_counts = %{$dbh->selectall_hashref($sql,"mapping_quality")};


	  foreach my $mapping_q (keys %mapping_counts) {
	       my $perc = ($mapping_counts{$mapping_q}{'mapping_count'}/$total_diff_length) * 100;
	       $perc = sprintf "%.2f", $perc;

	       $support->log_stamped("$mapping_counts{$mapping_q}{'mapping_count'} $feature_type mappings $mapping_quality{$mapping_q} found for chromosome where feature length in the old assembly db is different than in the new assembly db and either mapping start or end is within the difference between old and new feature length from the new feature start or end, respectively ($perc\%)\n\n", 3);
	       $ok_mappings += $mapping_counts{$mapping_q}{'mapping_count'};
	   
	  }

	  my $perc_mappings = ($ok_mappings/$total_diff_length) * 100;
	  $perc_mappings = sprintf "%.2f", $perc_mappings;
	  $support->log_stamped("total: $ok_mappings ok $feature_type mappings ($perc_mappings\%):\n\n", 3);

	  $ok_mappings_1 += $ok_mappings;	  

      }	 else {
	  $support->log_stamped("no $feature_type" . "s found for chromosome $chr where feature length in the old assembly db is different than in the new assembly db\n\n", 2);
      } 

      $perc_mappings_1 = ($ok_mappings_1/$total_features_old_new) * 100;
      $perc_mappings_1 = sprintf "%.2f", $perc_mappings_1;
      $support->log_stamped("total: $ok_mappings_1 ok $feature_type mappings ($perc_mappings_1\%):\n\n", 1);
      

  }
 
}

# finish logfile
$support->finish_log;