Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

trash.cc

Go to the documentation of this file.
00001 /* FILE: trash_revcomp.cc
00002    Copyright Erik Arner and Martti Tammi, University of Uppsala,
00003    Dept. of Genetics and Pathology, 2000.
00004 
00005    This program uses the files produced by PHRED. The reads that don't
00006    have acceptable quality, will be deleted from the PHD and fasta files.
00007    An additional trash.mtas file is created and the deleted sequences
00008    are stored there.
00009 
00010    This program is a module in the TRAP program package.
00011 
00012    MOdified in 2004 to fit with the Trapper system.
00013 */
00014 
00015 #include <iostream>
00016 #include <cmath>
00017 #include <cassert>
00018 #include "trash.h"
00019 
00020 
00021 using namespace std;
00022 
00023 void TrashAlgo::start()
00024   // {{{ 
00025 {
00026 
00027 
00028 
00029   TRASH_WINDOW_LEN = 35;
00030   TRASH_WINDOW_STEP = 3;
00031   MIN_NUM_ADJ_WIN_GOOD = 4;
00032   MIN_LEN_GOOD = 50;
00033   GOOD_PERCENTAGE = 0.89;
00034   MAX_VARIANCE_IN_QUALWIN = 0.0035;
00035   
00036 
00037   //Note: this assumes that quality values are 99 at the most!
00038   for( int i = 0; i < 100; i++ ) {
00039     logQual_to_quality.push_back( 1 - pow(10.0,double(-i)/10.0) );
00040     decode_qualval_square.push_back( logQual_to_quality[i] * logQual_to_quality[i] );
00041   }
00042 
00043 
00044    
00045    
00046   for( size_t i = 0; i < getMAl()->get_num_seq(); i++ ) {
00047     
00048     trash(i);
00049     
00050     //Temp test
00051 //     for( size_t j = 0; j < getMAl()->get_seq_end( i ); j++ ) {
00052 //       if ( j < getMAl()->get_beginGood( i ) ||  j >= getMAl()->get_endGood( i ) )
00053 //         getMAl()->set_base( i, j, 'x' );
00054 //     }
00055     
00056   }
00057 
00058     
00059 }
00060 
00061 // }}}
00062 void TrashAlgo::trash(size_t seqNumber)
00063   // {{{ 
00064 
00065 {
00066   /* This function computes the quality regions of the
00067      sequences, and marks those sequences with too poor
00068      quality to be deleted from further investigation.
00069 
00070   */
00071 
00072 
00073   int num_adj_win_good(0);
00074   int begin_good(-1), end_good(-1);
00075   int bg_found(0), eg_found(0);
00076   int  reset_good(1), curr_win_good(0);
00077     
00078   //Left end
00079   bool break_for(false);
00080   int start = 0;
00081   int stop = getMAl()->get_seq_end(seqNumber) - TRASH_WINDOW_LEN + 1;
00082  
00083   for( int j = start; j < stop; j += TRASH_WINDOW_STEP ) {
00084       
00085     if ( reset_good ) {
00086       begin_good = j;
00087     }
00088     reset_good = 0;
00089     
00090     //Adjust window size for gaps
00091     int num_bases_in_win(0);
00092     int num_gaps_in_win(0);
00093     while ( num_bases_in_win < TRASH_WINDOW_LEN ) {
00094       if ( j + TRASH_WINDOW_LEN + num_gaps_in_win > getMAl()->get_seq_end(seqNumber) ) {
00095         
00096         break_for = true;
00097         break;
00098       }
00099 //       cerr<<getMAl()->get_base( seqNumber, j + num_bases_in_win + num_gaps_in_win );
00100       if ( getMAl()->get_base( seqNumber, j + num_bases_in_win + num_gaps_in_win ) == '-' ) {
00101 
00102         num_gaps_in_win++;
00103       }
00104       else {
00105         num_bases_in_win++;
00106       }
00107     }
00108 //     cerr<<endl;
00109     if ( break_for ) break;
00110 
00111     j += check_qual_in_win(j, j + TRASH_WINDOW_LEN + num_gaps_in_win, seqNumber, curr_win_good);
00112 
00113     /* Window status is now computed and it may be
00114        possible to set begin_good */
00115 
00116     if ( curr_win_good ) {//good or hq
00117       num_adj_win_good++;
00118       
00119       if ( num_adj_win_good >= MIN_NUM_ADJ_WIN_GOOD ) {
00120         //Found begin_good
00121         bg_found = 1;
00122       }
00123     }
00124     else {
00125       if ( !bg_found ) {
00126         reset_good = 1;
00127         num_adj_win_good = 0;
00128       }
00129     }
00130       
00131     if ( bg_found ) {
00132       break;
00133     }
00134   }//End left
00135 
00136   /* Right end -- same thing, but backwards.
00137      Unnecessary if begin good not found. */
00138 
00139   if ( bg_found ) {
00140     reset_good = 1;
00141     curr_win_good = 0;
00142     num_adj_win_good = 0;
00143 
00144     break_for = false;
00145     
00146     start = getMAl()->get_seq_end(seqNumber) - 1;
00147     stop = TRASH_WINDOW_LEN - 2;
00148       
00149     for( int j = start; j > stop; j -= TRASH_WINDOW_STEP ) {
00150         
00151       if ( reset_good ) {
00152         end_good = j + 1;
00153       }
00154 
00155       reset_good = 0;
00156 
00157       //Adjust window size for gaps
00158       int num_bases_in_win(0);
00159       int num_gaps_in_win(0);
00160       while ( num_bases_in_win < TRASH_WINDOW_LEN ) {
00161 //         cerr<<getMAl()->get_base( seqNumber, j - num_gaps_in_win - num_bases_in_win );
00162         if ( j - TRASH_WINDOW_LEN + 1 - num_gaps_in_win < 0 ) {
00163           
00164           break_for = true;
00165           break;
00166         }
00167         if ( getMAl()->get_base( seqNumber, j - num_gaps_in_win - num_bases_in_win ) == '-' ) {
00168           num_gaps_in_win++;
00169         }
00170         else {
00171           num_bases_in_win++;
00172         }
00173       }
00174 //       cerr<<endl;
00175       if ( break_for ) break;
00176       
00177       
00178       j -= check_qual_in_win(j - TRASH_WINDOW_LEN + 1 - num_gaps_in_win, j + 1, seqNumber, curr_win_good);
00179         
00180       /* Window status is now computed and it may be
00181          possible to set end_good */
00182         
00183       if ( curr_win_good ) {
00184         num_adj_win_good++;
00185           
00186         if ( num_adj_win_good >= MIN_NUM_ADJ_WIN_GOOD ) {
00187           //Found begin_good
00188           eg_found = 1;
00189         }
00190       }
00191       else {
00192         if ( !eg_found ) {
00193           reset_good = 1;
00194           num_adj_win_good = 0;
00195         }
00196       }
00197         
00198       if ( eg_found ) {
00199         break;
00200       }
00201     }//End right forj
00202   }//End right
00203     
00204   //Set the parameters for the read
00205     
00206   int len = end_good - begin_good;
00207   if ( ( bg_found && eg_found ) && ( len >= MIN_LEN_GOOD ) ) {
00208 
00209     getMAl()->set_beginGood(seqNumber, begin_good);
00210     getMAl()->set_endGood(seqNumber, end_good);
00211 
00212   }
00213   else {
00214 
00215     kill_read(seqNumber);
00216   }
00217 
00218   //}//End fori loop over all seqs
00219 
00220 
00221 }
00222 
00223 // }}}
00224 int TrashAlgo::check_qual_in_win(int start, int stop, int ID, int &curr_win_good)
00225   // {{{ 
00226 
00227 {
00228 
00229 //   cerr<<"stop: "<<stop<<endl;
00230 //   cerr<<"start: "<<start<<endl;
00231   assert( stop <= getMAl()->get_seq_end( ID) );
00232 
00233   double sum_qual(0.0), sum_qual_square(0.0), average_qual(0.0);
00234   int minLogQual(0);
00235   int num_gap(0);
00236 
00237   //Check quality in window
00238   for( int k = start; k < stop; k++ ) {
00239 //     cerr<<getMAl()->get_base( ID, k );
00240 //     if ( stop >= getMAl()->get_seq_end( ID) ) {
00241 //       assert( stop == getMAl()->get_seq_end( ID) );
00242 //       //Should not happen...
00243 // //       assert(false);
00244 //       break;
00245 //     }
00246     
00247     if ( getMAl()->get_base( ID, k ) == '-' ) {
00248 
00249       num_gap++;
00250       continue;
00251     }
00252     
00253     
00254     sum_qual += logQual_to_quality[ getMAl()->get_qual(ID, k) ];
00255     sum_qual_square += decode_qualval_square[ getMAl()->get_qual(ID, k) ];
00256     int tempQual = getMAl()->get_qual(ID, k);
00257     if(tempQual >= minLogQual)
00258       minLogQual = tempQual;
00259   }
00260 //   cerr<<endl;
00261   average_qual = sum_qual / (double)TRASH_WINDOW_LEN;
00262 
00263   if ( average_qual >= GOOD_PERCENTAGE ) 
00264     {
00265       
00266       //Window may qualify as good and high quality
00267       double Q = sum_qual_square - sum_qual*sum_qual/(double)TRASH_WINDOW_LEN;
00268       assert(TRASH_WINDOW_LEN > 1);
00269       double variance = Q / (double)(TRASH_WINDOW_LEN - 1);
00270       if ( variance <= MAX_VARIANCE_IN_QUALWIN || minLogQual >= 27) {
00271         //Window qualifies as good
00272         curr_win_good = 1;
00273       }
00274       else {
00275         //Variance too high for good and hq
00276         curr_win_good = 0;
00277       }
00278       
00279     }
00280   else 
00281     {
00282       //Window does not qualify as good or high quality
00283       curr_win_good = 0;
00284     }
00285 
00286   if ( stop - start != TRASH_WINDOW_LEN + num_gap ) {
00287     cerr<<"stop - start: "<<stop - start<<endl;
00288     cerr<<"TRASH_WINDOW_LEN + num_gap: "<<TRASH_WINDOW_LEN + num_gap<<endl;
00289     cerr<<"stop: "<<stop<<endl;
00290     cerr<<"start: "<<start<<endl;
00291     cerr<<"TRASH_WINDOW_LEN: "<<TRASH_WINDOW_LEN<<endl;
00292     cerr<<"num_gap: "<<num_gap<<endl;
00293   }
00294 //   cerr<<"num_gap: "<<num_gap<<endl;
00295   
00296   assert( stop - start == TRASH_WINDOW_LEN + num_gap );
00297   return num_gap;
00298 }
00299 
00300 // }}}
00301 void TrashAlgo::kill_read(int ID)
00302   // {{{ 
00303 
00304 {
00305 
00306   getMAl()->set_beginGood( ID, getMAl()->get_len( ID )/2 );
00307   getMAl()->set_endGood( ID, getMAl()->get_len( ID )/2 );
00308 }
00309 
00310 // }}}

Generated on Fri Mar 17 17:44:24 2006 for trapper by  doxygen 1.4.4