00001 #include "MAl_addon.h"
00002 #include "trapperDNP.h"
00003 #include "dnp_types.h"
00004 #include <cassert>
00005
00006 #include <cmath>
00007
00008 DNPclass::DNPclass( MAl_addon* mal, int minqual, int d_min, double p_max ) :
00009 m(mal), num_candidates(0), qualmin(minqual), Dmin(d_min), p_tot_max(p_max)
00010
00011
00012 {
00013
00014
00015
00016 for( int q = 0; q < 100; q++ ) {
00017
00018 quality2error.push_back( std::pow(10.0, -(double)q/10.0) );
00019
00020 }
00021 base_trans['a'] = 'a';
00022 base_trans['A'] = 'a';
00023
00024 base_trans['t'] = 't';
00025 base_trans['T'] = 't';
00026
00027 base_trans['g'] = 'g';
00028 base_trans['G'] = 'g';
00029
00030 base_trans['c'] = 'c';
00031 base_trans['C'] = 'c';
00032
00033 base_trans['n'] = 'n';
00034 base_trans['N'] = 'n';
00035
00036 base_trans['x'] = 'x';
00037 base_trans['X'] = 'x';
00038
00039 base_trans['-'] = '-';
00040
00041 base_trans['*'] = '*';
00042
00043 cerr<<"Finding candidate positions... ";
00044 find_DNP_candidate_positions();
00045 cerr<<"Done"<<endl;
00046 cerr<<"Evaluating candidates... ";
00047 evaluate_DNP_candidate_positions();
00048 cerr<<"Done"<<endl;
00049 cerr<<num_found<<" DNPs found at "<<num_pos<<" positions"<<endl;
00050 }
00051
00052
00053 DNPclass::~DNPclass()
00054
00055
00056 {
00057
00058 }
00059
00060
00061
00062 void DNPclass::find_DNP_candidate_positions()
00063
00064
00065 {
00066
00067
00068 int min = m->get_min_col();
00069 int max = m->get_max_col();
00070
00071
00072
00073 for( int i = min; i < max; i++ ) {
00074 if ( column_is_candidate(i) ) {
00075 cand_vec.push_back(i);
00076 }
00077 int proc = 100 * i / max;
00078
00079
00080 }
00081 assert( cand_vec.size() == cand_bases.size() );
00082 num_candidates += (int)cand_vec.size();
00083 }
00084
00085
00086 int DNPclass::column_is_candidate(int col)
00087
00088
00089 {
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102 char cons = base_trans [ m->get_most_freq_base_on_col( col ) ];
00103 if ( cons != 'a' && cons != 'c' &&
00104 cons != 'g' && cons != 't' ) return 0;
00105
00106
00107
00108 char base_arr[] = {'a', 'c', 'g', 't'};
00109 int cand_num(0);
00110 int cand_base = 'u';
00111 int cand_found(0);
00112
00113 for( int i = 0; i < 4; i++ ) {
00114 char base = base_trans [ base_arr[i] ];
00115 if ( base == cons ) continue;
00116
00117 if ( m->get_num_bases_on_col(col, base) >= Dmin ) {
00118 if ( m->get_num_bases_on_col(col, base) > cand_num ) {
00119 cand_num = m->get_num_bases_on_col(col, base);
00120 cand_base = base;
00121 cand_found = 1;
00122 }
00123 }
00124 }
00125
00126
00127 if ( cand_found ) {
00128 cand_bases.push_back( cand_base );
00129 cons_bases.push_back( cons );
00130 return 1;
00131 }
00132 return 0;
00133
00134 }
00135
00136
00137
00138 void DNPclass::evaluate_DNP_candidate_positions()
00139
00140
00141
00142 {
00143
00144
00145
00146
00147 num_found = 0;
00148 num_pos = 0;
00149
00150 assert( (int)confirmed_DNP_pos.size() == 0 );
00151 for( int i = 0; i < m->get_max_col(); i++ ) {
00152 confirmed_DNP_pos.push_back(0);
00153 }
00154
00155 vector<char> first_base_col, second_base_col;
00156 vector<int> first_qual_col, second_qual_col;
00157 vector<int> rownum_vec;
00158
00159
00160 for( int i = 0; i < (int)cand_vec.size() - 1; i++ ) {
00161 int proc = 100 * i / cand_vec.size();
00162
00163
00164 for( int j = i+1; j < (int)cand_vec.size(); j++ ) {
00165
00166 pick_base_row_and_qual_columns(cand_vec[i], cand_vec[j], first_base_col,
00167 second_base_col, first_qual_col, second_qual_col,
00168 rownum_vec);
00169
00170 if ( (int)first_base_col.size() < Dmin*2 ) {
00171
00172
00173 break;
00174 }
00175 char cand_base_1 = cand_bases[i];
00176 char cand_base_2 = cand_bases[j];
00177
00178 int num_cand_1(0), num_cand_2(0);
00179 double exp_err_first(0.0), exp_err_second(0.0);
00180 int num_corr(0);
00181 double expectation_val(0.0);
00182 look_for_corr(first_base_col, second_base_col, first_qual_col,
00183 second_qual_col, cand_base_1, cand_base_2,
00184 num_cand_1, num_cand_2, exp_err_first, exp_err_second,
00185 num_corr, expectation_val);
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195 if ( num_cand_1 < Dmin ) {
00196
00197 break;
00198 }
00199 int DNPfound(0);
00200 double tot_prob=0;
00201 double col_prob=0;
00202 double p1(-1), p2(-1), corr_prob(-1);
00203
00204 if ( num_corr >= Dmin ) {
00205
00206
00207 p1 = poisson_probability(exp_err_first, num_cand_1);
00208 p2 = poisson_probability(exp_err_second, num_cand_1);
00209 col_prob = 1.0 - (1.0 - p1) * (1.0 - p2);
00210
00211 assert( col_prob >= 0.0 && col_prob <= 1.0);
00212
00213 corr_prob = poisson_probability(expectation_val, num_corr);
00214 tot_prob = col_prob * corr_prob;
00215
00216
00217 if (tot_prob <= p_tot_max || num_corr > Dmin )DNPfound = 1;
00218
00219 }
00220
00221
00222 if ( DNPfound ) {
00223
00224 if ( !confirmed_DNP_pos[ cand_vec[i] ] ||
00225 !confirmed_DNP_pos[ cand_vec[j] ]) {
00226 num_pos++;
00227 }
00228
00229
00230 assert( cand_vec[i] < (int)confirmed_DNP_pos.size() );
00231 assert( cand_vec[j] < (int)confirmed_DNP_pos.size() );
00232
00233 confirmed_DNP_pos[ cand_vec[i] ] = 1;
00234 confirmed_DNP_pos[ cand_vec[j] ] = 1;
00235 set_DNPs( cand_vec[i], cand_vec[j],
00236 cand_bases[i], cand_bases[j], rownum_vec,
00237 i, j, cons_bases[i], cons_bases[j]);
00238 }
00239
00240 }
00241 }
00242 }
00243
00244
00245
00246
00247 void DNPclass::set_DNPs( int col1, int col2, char DNPbase1, char DNPbase2,
00248 vector<int> &rownum_vec, int ID1, int ID2,
00249 char cons1, char cons2)
00250
00251
00252 {
00253
00254 for( int i = 0; i < (int)rownum_vec.size(); i++ ) {
00255
00256 if ( m->get_entry( rownum_vec[i], col1 ) == DNPbase1 &&
00257 m->get_entry( rownum_vec[i], col2 ) == DNPbase2 &&
00258 m->get_qual(rownum_vec[i], col1) >= qualmin &&
00259 m->get_qual(rownum_vec[i], col2) >= qualmin) {
00260
00261 if ( !m->is_DNP( rownum_vec[i] , col1 ) ) {
00262 num_found++;
00263 }
00264 if ( !m->is_DNP( rownum_vec[i] , col2 ) ) {
00265 num_found++;
00266 }
00267
00268
00269 m->set_DNP( rownum_vec[i] , col1, true, ID1, dnp_type(DNPbase1, cons1) );
00270 m->set_DNP( rownum_vec[i] , col2, true, ID2, dnp_type(DNPbase2, cons2));
00271
00272
00273 }
00274 }
00275 }
00276
00277
00278
00279 void DNPclass::look_for_corr( vector<char> &first_col, vector<char> &second_col,
00280 vector<int> &first_qual_col, vector<int> &second_qual_col,
00281 char cand_base_1, char cand_base_2, int &num_err_first, int &num_err_second,
00282 double &exp_err_first, double &exp_err_second, int &num_corr,
00283 double &expectation_val)
00284
00285
00286 {
00287
00288
00289
00290
00291
00292
00293
00294
00295 num_err_first = num_err_second = 0;
00296 exp_err_first = exp_err_second = 0.0;
00297 expectation_val = 0;
00298 num_corr = 0;
00299
00300
00301
00302 for( int i = 0; i < (int)first_col.size(); i++ ) {
00303 assert( first_qual_col[i] > -1 && first_qual_col[i] < 100 );
00304 assert( second_qual_col[i] > -1 && second_qual_col[i] < 100 );
00305
00306 int no_1_cand(0), no_2_cand(0);
00307
00308 exp_err_first += quality2error[ first_qual_col[i] ];
00309 exp_err_second += quality2error[ second_qual_col[i] ];
00310
00311
00312 if ( first_col[i] == cand_base_1 && first_qual_col[i] >= qualmin ) {
00313 num_err_first++;
00314 no_1_cand = 1;
00315 }
00316 if ( second_col[i] == cand_base_2 && second_qual_col[i] >= qualmin ) {
00317 num_err_second++;
00318 no_2_cand = 1;
00319 }
00320 if ( no_1_cand && no_2_cand )
00321 num_corr++;
00322 }
00323
00324
00325
00326
00327 for( int i = 0; i < (int)first_col.size(); i++ ) {
00328 double err1 = quality2error[ first_qual_col[i] ];
00329 double err2 = quality2error[ second_qual_col[i] ];
00330 double new_sum = exp_err_first - err1;
00331 double Exp1 = num_err_first * err1 / ( num_err_first * err1 + new_sum * (1.0 - err1) );
00332 new_sum = exp_err_second - err2;
00333 double Exp2 = num_err_second * err2 / ( num_err_second * err2 + new_sum * (1.0 - err2) );
00334
00335 expectation_val += Exp1 * Exp2;
00336 }
00337
00338 }
00339
00340
00341 double DNPclass::poisson_probability(double lambda, int obs)
00342
00343
00344 {
00345
00346
00347
00348 assert( lambda >= 0.0);
00349 assert( obs >= 0 );
00350 double M = lambda;
00351 int N_in = obs;
00352 int N = N_in - 1;
00353 int J (0);
00354 double S = exp(-M);
00355 double res(0.0);
00356
00357 if(N < 0){
00358 return 1.0;
00359 }
00360
00361
00362 if(S==0){
00363 J = N;
00364 if(J > M){
00365 J = int(M);
00366 }
00367
00368 double F(0.0);
00369 for( int i = 1; i <= J; i++ ) {
00370 F += log((double)i);
00371 }
00372 double L = -M-F+J*log(M);
00373 L = exp(L);
00374 F=1;
00375
00376 double T(0.0);
00377
00378 for( int i = 1; i <= J; i++ ) {
00379 F *= (J+1-i)/M;
00380 T += F;
00381 }
00382
00383 if(J==N){
00384 T=(T+1)*L;
00385 res = 1.0 - T;
00386
00387 }
00388 else{
00389 F=1;
00390 for( int i = 1; i <= N-J; i++ ) {
00391 F *= M/(i+J);
00392 T=T+F;
00393 }
00394 T=(T+1)*L;
00395 res = (1.0 - T);
00396 }
00397
00398 }
00399 else{
00400
00401 double T=S;
00402 for( int i = 0; i <= N-1; i++ ) {
00403 S = S*M/(i+1);
00404 T += S;
00405 }
00406 res = (1.0 - T);
00407 }
00408
00409 if ( res < 0.0 ) {
00410 if ( -res < 0.0000000000001 )
00411 res = 0.0;
00412 }
00413 else if ( res > 1.0 ) {
00414 if ( res - 1.0 < 0.0000000000001 )
00415 res = 1.0;
00416 }
00417 assert( res <= 1.0 );
00418 assert( res >= 0.0 );
00419
00420 return res;
00421 }
00422
00423
00424 void DNPclass::pick_base_row_and_qual_columns(int firstDNPindex, int secondDNPindex,
00425 vector<char> &firstDNPvec, vector<char> &secondDNPvec,
00426 vector<int> &first_qual_vec, vector<int> &second_qual_vec,
00427 vector<int> &rownum_vec)
00428
00429
00430 {
00431 firstDNPvec.erase( firstDNPvec.begin(), firstDNPvec.end() );
00432 secondDNPvec.erase( secondDNPvec.begin(), secondDNPvec.end() );
00433 first_qual_vec.erase( first_qual_vec.begin(), first_qual_vec.end() );
00434 second_qual_vec.erase( second_qual_vec.begin(), second_qual_vec.end() );
00435 rownum_vec.erase( rownum_vec.begin(), rownum_vec.end() );
00436
00437 for( int j = 0; j < m->get_num_rows(); j++ ) {
00438
00439 if ( firstDNPindex >= m->get_row_end(j) || firstDNPindex < m->get_row_start(j) )
00440 continue;
00441 if ( secondDNPindex >= m->get_row_end(j) || secondDNPindex < m->get_row_start(j) )
00442 continue;
00443
00444
00445
00446 char baseOnPos1 = base_trans[ m->get_entry(j, firstDNPindex) ];
00447 char baseOnPos2 = base_trans[ m->get_entry(j, secondDNPindex) ];
00448
00449
00450 if(baseOnPos1 == 'x' || baseOnPos2 == 'x')
00451 continue;
00452
00453 assert(baseOnPos1 == 'a' || baseOnPos1 == 't' || baseOnPos1 == 'g' ||
00454 baseOnPos1 == 'c' || baseOnPos1 == 'n' || baseOnPos1 == '-' ||
00455 baseOnPos1 == 'x' || baseOnPos1 == '*' );
00456 assert(baseOnPos2 == 'a' || baseOnPos2 == 't' || baseOnPos2 == 'g' ||
00457 baseOnPos2 == 'c' || baseOnPos2 == 'n' || baseOnPos2 == '-' ||
00458 baseOnPos2 == 'x' || baseOnPos2 == '*');
00459
00460 int qualOnPos1 = m->get_qual(j, firstDNPindex);
00461 int qualOnPos2 = m->get_qual(j, secondDNPindex);
00462
00463 firstDNPvec.push_back(baseOnPos1);
00464 secondDNPvec.push_back(baseOnPos2);
00465
00466 first_qual_vec.push_back(qualOnPos1);
00467 second_qual_vec.push_back(qualOnPos2);
00468
00469 assert( j < m->get_num_rows() );
00470
00471 rownum_vec.push_back( j );
00472 }
00473 }
00474
00475