trapper: trapperDNP.cc Source File

00001 #include "MAl_addon.h"
00002 #include "trapperDNP.h"
00003 #include "dnp_types.h"
00004 #include <cassert>
00005 
00006 #include <cmath>
00007 
00008 DNPclass::DNPclass( MAl_addon* mal, int minqual, int d_min, double p_max ) : 
00009   m(mal), num_candidates(0), qualmin(minqual), Dmin(d_min), p_tot_max(p_max)
00010   // {{{ 
00011 
00012 {
00013   
00014   // Fill in the 'quality2error vector'
00015 
00016   for( int q = 0; q < 100; q++ ) {
00017     
00018     quality2error.push_back( std::pow(10.0, -(double)q/10.0)  );
00019     
00020   }
00021   base_trans['a'] = 'a';
00022   base_trans['A'] = 'a';
00023 
00024   base_trans['t'] = 't';
00025   base_trans['T'] = 't';
00026   
00027   base_trans['g'] = 'g';
00028   base_trans['G'] = 'g';
00029 
00030   base_trans['c'] = 'c';
00031   base_trans['C'] = 'c';
00032   
00033   base_trans['n'] = 'n';
00034   base_trans['N'] = 'n';
00035 
00036   base_trans['x'] = 'x';
00037   base_trans['X'] = 'x';
00038   
00039   base_trans['-'] = '-';
00040 
00041   base_trans['*'] = '*';
00042 
00043   cerr<<"Finding candidate positions...  ";
00044   find_DNP_candidate_positions();
00045   cerr<<"Done"<<endl;
00046   cerr<<"Evaluating candidates... ";
00047   evaluate_DNP_candidate_positions();
00048   cerr<<"Done"<<endl;
00049   cerr<<num_found<<" DNPs found at "<<num_pos<<" positions"<<endl;
00050 }
00051 
00052 // }}}
00053 DNPclass::~DNPclass()
00054   // {{{ 
00055 
00056 {
00057 
00058 }
00059 
00060 // }}}
00061 //**************************************************************
00062 void DNPclass::find_DNP_candidate_positions()
00063   // {{{ 
00064 
00065 {
00066   
00067   
00068   int min = m->get_min_col();
00069   int max = m->get_max_col();
00070   
00071   
00072   
00073   for( int i = min; i < max; i++ ) {
00074     if ( column_is_candidate(i) ) {
00075       cand_vec.push_back(i);
00076     }
00077     int proc = 100 * i / max;
00078 //     if ( proc % 5 == 0 )
00079 //       cerr<<proc<<" % checked"<<endl;
00080   }
00081   assert( cand_vec.size() == cand_bases.size() );
00082   num_candidates += (int)cand_vec.size();
00083 }
00084 
00085 // }}}
00086 int DNPclass::column_is_candidate(int col)
00087   // {{{ 
00088 
00089 {
00090   /* 
00091      Definition of candidate base type:
00092      
00093      ¤ Must be present in at least Dmin reads
00094      ¤ Can be present in at most half the number of reads on column
00095      
00096      ¤ Extra: at present there can be only one candidate base type 
00097      on a column. Without this constraint, a more involved approach 
00098      has to be taken in order to identify candidate base types.
00099      
00100   */
00101  
00102   char cons = base_trans [ m->get_most_freq_base_on_col( col ) ];
00103   if ( cons != 'a' && cons != 'c' && 
00104        cons != 'g' && cons != 't'  ) return 0;
00105   
00106 
00107 
00108   char base_arr[] = {'a', 'c', 'g', 't'/*, '-'*/};
00109   int cand_num(0);
00110   int cand_base = 'u';
00111   int cand_found(0);
00112   
00113   for( int i = 0; i < 4; i++ ) {
00114     char base = base_trans [ base_arr[i] ];
00115     if ( base == cons /*|| base != m->get_entry( 0, col )*/ /*NOTA BENE*/) continue;
00116     
00117     if ( m->get_num_bases_on_col(col, base) >= Dmin ) {
00118       if ( m->get_num_bases_on_col(col, base) > cand_num ) {
00119         cand_num = m->get_num_bases_on_col(col, base);
00120         cand_base = base;
00121         cand_found = 1;
00122       }
00123     }
00124   }
00125 
00126   
00127   if ( cand_found ) {
00128     cand_bases.push_back( cand_base );
00129     cons_bases.push_back( cons );
00130     return 1;
00131   }
00132   return 0;
00133   
00134 }
00135 
00136 // }}}
00137 
00138 void DNPclass::evaluate_DNP_candidate_positions()
00139 
00140   // {{{ 
00141 
00142 {
00143   /*
00144     Tentative try: compare all candidate columns to each other
00145   
00146   */
00147   num_found = 0;
00148   num_pos = 0;
00149   
00150   assert( (int)confirmed_DNP_pos.size() == 0 );
00151   for( int i = 0; i < m->get_max_col(); i++ ) {
00152     confirmed_DNP_pos.push_back(0);
00153   }
00154 
00155   vector<char> first_base_col, second_base_col;
00156   vector<int> first_qual_col, second_qual_col;
00157   vector<int> rownum_vec;
00158 
00159   
00160   for( int i = 0; i < (int)cand_vec.size() - 1; i++ ) {
00161     int proc = 100 * i / cand_vec.size();
00162 //     if ( proc % 5 == 0 )
00163 //       cerr<<proc<<" % checked"<<endl;
00164     for( int j = i+1; j < (int)cand_vec.size(); j++ ) {
00165       
00166       pick_base_row_and_qual_columns(cand_vec[i], cand_vec[j], first_base_col, 
00167                                      second_base_col, first_qual_col, second_qual_col, 
00168                                      rownum_vec);
00169       
00170       if ( (int)first_base_col.size() < Dmin*2 ) {//Need sufficient number of bases on columns
00171         //cerr<<"Size too small! i: "<<i<<", j: "<<j<<endl;
00172         //cerr<<"cand_vec[i]: "<<cand_vec[i]<<", cand_vec[j]: "<<cand_vec[j]<<endl;
00173         break;
00174       }
00175       char cand_base_1 = cand_bases[i];
00176       char cand_base_2 = cand_bases[j];
00177       
00178       int num_cand_1(0), num_cand_2(0);
00179       double exp_err_first(0.0), exp_err_second(0.0);
00180       int num_corr(0);
00181       double expectation_val(0.0);
00182       look_for_corr(first_base_col, second_base_col, first_qual_col, 
00183                     second_qual_col, cand_base_1, cand_base_2, 
00184                     num_cand_1, num_cand_2, exp_err_first, exp_err_second,
00185                     num_corr, expectation_val);
00186       
00187 //       double kvot;
00188 //       if ( num_cand_1 > num_cand_2 ) {
00189 //         kvot = double(num_cand_1)/double(num_cand_2);
00190 //       }
00191 //       else {
00192 //         kvot = double(num_cand_2)/double(num_cand_1);
00193 //       }
00194       
00195       if ( num_cand_1 < Dmin ) {
00196         //cerr<<"Too few candidates! i: "<<i<<", j: "<<j<<endl;
00197         break;
00198       }
00199       int DNPfound(0);
00200       double tot_prob=0;
00201       double col_prob=0;
00202       double p1(-1), p2(-1), corr_prob(-1);
00203 
00204       if ( num_corr >= Dmin ) { //NOTA BENE! Assumes Dmin = 3...
00205         //Computation of "the probability of observing any of the two columns by chance".
00206         
00207         p1 = poisson_probability(exp_err_first, num_cand_1);
00208         p2 = poisson_probability(exp_err_second, num_cand_1);
00209         col_prob = 1.0 - (1.0 - p1) * (1.0 - p2);
00210         
00211         assert( col_prob >= 0.0 && col_prob <= 1.0);//A bit shaky...
00212         
00213         corr_prob = poisson_probability(expectation_val, num_corr);
00214         tot_prob = col_prob * corr_prob;
00215         
00216 
00217         if (tot_prob <= p_tot_max || num_corr > Dmin )DNPfound = 1;
00218        
00219       }
00220       
00221       
00222       if ( DNPfound ) {
00223 
00224         if ( !confirmed_DNP_pos[ cand_vec[i] ] || 
00225              !confirmed_DNP_pos[ cand_vec[j] ]) {
00226           num_pos++;
00227         }
00228         
00229 
00230         assert( cand_vec[i] < (int)confirmed_DNP_pos.size() );
00231         assert( cand_vec[j] < (int)confirmed_DNP_pos.size() );
00232 
00233         confirmed_DNP_pos[ cand_vec[i] ] = 1;
00234         confirmed_DNP_pos[ cand_vec[j] ] = 1;
00235         set_DNPs( cand_vec[i], cand_vec[j], 
00236                   cand_bases[i], cand_bases[j], rownum_vec, 
00237                   i, j, cons_bases[i], cons_bases[j]);
00238       }
00239 
00240     }
00241   }
00242 }
00243 
00244 // }}}
00245 
00246 
00247 void DNPclass::set_DNPs( int col1, int col2, char DNPbase1, char DNPbase2, 
00248                          vector<int> &rownum_vec, int ID1, int ID2, 
00249                          char cons1, char cons2)
00250 // {{{ 
00251 
00252 {
00253 
00254   for( int i = 0; i < (int)rownum_vec.size(); i++ ) {
00255     
00256     if ( m->get_entry( rownum_vec[i], col1 ) == DNPbase1 &&
00257          m->get_entry( rownum_vec[i], col2 ) == DNPbase2 &&
00258          m->get_qual(rownum_vec[i], col1) >= qualmin &&
00259          m->get_qual(rownum_vec[i], col2) >= qualmin) {
00260       //Set DNP
00261       if ( !m->is_DNP( rownum_vec[i] , col1 ) ) {
00262         num_found++;
00263       }
00264       if ( !m->is_DNP( rownum_vec[i] , col2 ) ) {
00265         num_found++;
00266       }
00267       
00268 
00269       m->set_DNP( rownum_vec[i] , col1, true, ID1, dnp_type(DNPbase1, cons1) );
00270       m->set_DNP( rownum_vec[i] , col2, true, ID2, dnp_type(DNPbase2, cons2));
00271 
00272       
00273     }
00274   }
00275 }
00276 
00277 // }}}
00278 
00279 void DNPclass::look_for_corr( vector<char> &first_col, vector<char> &second_col, 
00280                               vector<int> &first_qual_col, vector<int> &second_qual_col, 
00281                               char cand_base_1, char cand_base_2, int &num_err_first, int &num_err_second, 
00282                               double &exp_err_first, double &exp_err_second, int &num_corr, 
00283                               double &expectation_val)
00284   // {{{ 
00285 
00286 {
00287   //NOTA BENE! This function currently performs its analysis
00288   //with regard to the candidate base. That is, every base
00289   //present on the column is regarded as correct if it is of
00290   //another type than the candidate base.
00291 
00292   //The two results are stored in expectation_val and returned
00293   //in col_prob, respectively.
00294 
00295   num_err_first = num_err_second = 0;
00296   exp_err_first = exp_err_second = 0.0;
00297   expectation_val = 0;
00298   num_corr = 0;
00299   
00300   //Pre-computation of necessary params
00301 
00302   for( int i = 0; i < (int)first_col.size(); i++ ) {
00303     assert( first_qual_col[i] > -1 && first_qual_col[i] < 100 );
00304     assert( second_qual_col[i] > -1 && second_qual_col[i] < 100 );
00305     
00306     int no_1_cand(0), no_2_cand(0);
00307 
00308     exp_err_first += quality2error[ first_qual_col[i] ];
00309     exp_err_second += quality2error[ second_qual_col[i] ];
00310     
00311     
00312     if ( first_col[i] == cand_base_1 && first_qual_col[i] >= qualmin ) {
00313       num_err_first++;
00314       no_1_cand = 1;
00315     }
00316     if ( second_col[i] == cand_base_2 && second_qual_col[i] >= qualmin ) {
00317       num_err_second++;
00318       no_2_cand = 1;
00319     }
00320     if ( no_1_cand && no_2_cand )
00321       num_corr++;
00322   }
00323   
00324 
00325   //Computation of the expectation value for the number of coinciding deviations
00326   
00327   for( int i = 0; i < (int)first_col.size(); i++ ) {
00328     double err1 = quality2error[ first_qual_col[i] ];
00329     double err2 = quality2error[ second_qual_col[i] ];
00330     double new_sum = exp_err_first - err1;
00331     double Exp1 = num_err_first * err1 / ( num_err_first * err1 + new_sum * (1.0 - err1) );
00332     new_sum = exp_err_second - err2;
00333     double Exp2 = num_err_second * err2 / ( num_err_second * err2 + new_sum * (1.0 - err2) );
00334     
00335     expectation_val += Exp1 * Exp2;
00336   }
00337 
00338 }
00339 
00340 // }}}
00341 double DNPclass::poisson_probability(double lambda, int obs)
00342   // {{{ 
00343 
00344 {
00345   //   if ( lambda < 0.0 ) {
00346   //     cerr<<"lambda: "<<lambda<<endl;
00347   //   }
00348   assert( lambda >= 0.0);
00349   assert( obs >= 0 );
00350   double M = lambda;
00351   int N_in = obs;
00352   int N = N_in - 1;
00353   int J       (0);
00354   double S = exp(-M);
00355   double res(0.0);
00356   
00357   if(N < 0){
00358     return 1.0;
00359   }
00360   
00361 
00362   if(S==0){
00363     J = N;
00364     if(J > M){
00365       J = int(M);
00366     }
00367 
00368     double F(0.0);
00369     for( int i = 1; i <= J; i++ ) {
00370       F += log((double)i);
00371     }
00372     double L = -M-F+J*log(M);
00373     L = exp(L);
00374     F=1;
00375 
00376     double T(0.0);
00377 
00378     for( int i = 1; i <= J; i++ ) {
00379       F *= (J+1-i)/M;
00380       T += F;
00381     }
00382     
00383     if(J==N){
00384       T=(T+1)*L;
00385       res = 1.0 - T;
00386 
00387     }
00388     else{
00389       F=1;
00390       for( int i = 1; i <= N-J; i++ ) {
00391         F *= M/(i+J);
00392         T=T+F;
00393       }
00394       T=(T+1)*L;
00395       res = (1.0 - T);
00396     }
00397 
00398   }
00399   else{
00400     
00401     double T=S;
00402     for( int i = 0; i <= N-1; i++ ) {
00403       S = S*M/(i+1);
00404       T += S;
00405     }
00406     res = (1.0 - T);
00407   }
00408 
00409   if ( res < 0.0 ) {
00410     if ( -res < 0.0000000000001 )
00411       res = 0.0;
00412   }
00413   else if ( res > 1.0 ) {
00414     if ( res - 1.0 < 0.0000000000001 )
00415       res = 1.0;
00416   }
00417   assert( res <= 1.0 );
00418   assert( res >= 0.0 );
00419 
00420   return res;
00421 }
00422 
00423 // }}}
00424 void DNPclass::pick_base_row_and_qual_columns(int firstDNPindex, int secondDNPindex, 
00425                                               vector<char> &firstDNPvec, vector<char> &secondDNPvec,
00426                                               vector<int> &first_qual_vec, vector<int> &second_qual_vec,
00427                                               vector<int> &rownum_vec) 
00428   // {{{ 
00429 
00430 {
00431   firstDNPvec.erase( firstDNPvec.begin(), firstDNPvec.end() );
00432   secondDNPvec.erase( secondDNPvec.begin(), secondDNPvec.end() );
00433   first_qual_vec.erase( first_qual_vec.begin(), first_qual_vec.end() );
00434   second_qual_vec.erase( second_qual_vec.begin(), second_qual_vec.end() );
00435   rownum_vec.erase( rownum_vec.begin(), rownum_vec.end() );
00436 
00437   for( int j = 0; j < m->get_num_rows(); j++ ) {
00438     
00439     if ( firstDNPindex >= m->get_row_end(j) || firstDNPindex < m->get_row_start(j) )
00440       continue;
00441     if ( secondDNPindex >= m->get_row_end(j) || secondDNPindex < m->get_row_start(j) )
00442       continue;
00443 
00444     //Both bases come from the same read
00445 
00446     char baseOnPos1 = base_trans[ m->get_entry(j, firstDNPindex) ];      
00447     char baseOnPos2 = base_trans[ m->get_entry(j, secondDNPindex) ];
00448 
00449     //EK 030408
00450     if(baseOnPos1 == 'x' || baseOnPos2 == 'x')
00451       continue;
00452     
00453     assert(baseOnPos1 == 'a' || baseOnPos1 == 't' || baseOnPos1 == 'g' || 
00454            baseOnPos1 == 'c' || baseOnPos1 == 'n' || baseOnPos1 == '-' ||
00455            baseOnPos1 == 'x' || baseOnPos1 == '*' );
00456     assert(baseOnPos2 == 'a' || baseOnPos2 == 't' || baseOnPos2 == 'g' || 
00457            baseOnPos2 == 'c' || baseOnPos2 == 'n' || baseOnPos2 == '-' ||
00458            baseOnPos2 == 'x' || baseOnPos2 == '*');
00459     
00460     int qualOnPos1 = m->get_qual(j, firstDNPindex);      
00461     int qualOnPos2 = m->get_qual(j, secondDNPindex);
00462     
00463     firstDNPvec.push_back(baseOnPos1);
00464     secondDNPvec.push_back(baseOnPos2);
00465 
00466     first_qual_vec.push_back(qualOnPos1);
00467     second_qual_vec.push_back(qualOnPos2);
00468 
00469     assert( j < m->get_num_rows() );
00470 
00471     rownum_vec.push_back( j );
00472   }
00473 }
00474 
00475 // }}}