00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include <iostream>
00016 #include <cmath>
00017 #include <cassert>
00018 #include "trash.h"
00019
00020
00021 using namespace std;
00022
00023 void TrashAlgo::start()
00024
00025 {
00026
00027
00028
00029 TRASH_WINDOW_LEN = 35;
00030 TRASH_WINDOW_STEP = 3;
00031 MIN_NUM_ADJ_WIN_GOOD = 4;
00032 MIN_LEN_GOOD = 50;
00033 GOOD_PERCENTAGE = 0.89;
00034 MAX_VARIANCE_IN_QUALWIN = 0.0035;
00035
00036
00037
00038 for( int i = 0; i < 100; i++ ) {
00039 logQual_to_quality.push_back( 1 - pow(10.0,double(-i)/10.0) );
00040 decode_qualval_square.push_back( logQual_to_quality[i] * logQual_to_quality[i] );
00041 }
00042
00043
00044
00045
00046 for( size_t i = 0; i < getMAl()->get_num_seq(); i++ ) {
00047
00048 trash(i);
00049
00050
00051
00052
00053
00054
00055
00056 }
00057
00058
00059 }
00060
00061
00062 void TrashAlgo::trash(size_t seqNumber)
00063
00064
00065 {
00066
00067
00068
00069
00070
00071
00072
00073 int num_adj_win_good(0);
00074 int begin_good(-1), end_good(-1);
00075 int bg_found(0), eg_found(0);
00076 int reset_good(1), curr_win_good(0);
00077
00078
00079 bool break_for(false);
00080 int start = 0;
00081 int stop = getMAl()->get_seq_end(seqNumber) - TRASH_WINDOW_LEN + 1;
00082
00083 for( int j = start; j < stop; j += TRASH_WINDOW_STEP ) {
00084
00085 if ( reset_good ) {
00086 begin_good = j;
00087 }
00088 reset_good = 0;
00089
00090
00091 int num_bases_in_win(0);
00092 int num_gaps_in_win(0);
00093 while ( num_bases_in_win < TRASH_WINDOW_LEN ) {
00094 if ( j + TRASH_WINDOW_LEN + num_gaps_in_win > getMAl()->get_seq_end(seqNumber) ) {
00095
00096 break_for = true;
00097 break;
00098 }
00099
00100 if ( getMAl()->get_base( seqNumber, j + num_bases_in_win + num_gaps_in_win ) == '-' ) {
00101
00102 num_gaps_in_win++;
00103 }
00104 else {
00105 num_bases_in_win++;
00106 }
00107 }
00108
00109 if ( break_for ) break;
00110
00111 j += check_qual_in_win(j, j + TRASH_WINDOW_LEN + num_gaps_in_win, seqNumber, curr_win_good);
00112
00113
00114
00115
00116 if ( curr_win_good ) {
00117 num_adj_win_good++;
00118
00119 if ( num_adj_win_good >= MIN_NUM_ADJ_WIN_GOOD ) {
00120
00121 bg_found = 1;
00122 }
00123 }
00124 else {
00125 if ( !bg_found ) {
00126 reset_good = 1;
00127 num_adj_win_good = 0;
00128 }
00129 }
00130
00131 if ( bg_found ) {
00132 break;
00133 }
00134 }
00135
00136
00137
00138
00139 if ( bg_found ) {
00140 reset_good = 1;
00141 curr_win_good = 0;
00142 num_adj_win_good = 0;
00143
00144 break_for = false;
00145
00146 start = getMAl()->get_seq_end(seqNumber) - 1;
00147 stop = TRASH_WINDOW_LEN - 2;
00148
00149 for( int j = start; j > stop; j -= TRASH_WINDOW_STEP ) {
00150
00151 if ( reset_good ) {
00152 end_good = j + 1;
00153 }
00154
00155 reset_good = 0;
00156
00157
00158 int num_bases_in_win(0);
00159 int num_gaps_in_win(0);
00160 while ( num_bases_in_win < TRASH_WINDOW_LEN ) {
00161
00162 if ( j - TRASH_WINDOW_LEN + 1 - num_gaps_in_win < 0 ) {
00163
00164 break_for = true;
00165 break;
00166 }
00167 if ( getMAl()->get_base( seqNumber, j - num_gaps_in_win - num_bases_in_win ) == '-' ) {
00168 num_gaps_in_win++;
00169 }
00170 else {
00171 num_bases_in_win++;
00172 }
00173 }
00174
00175 if ( break_for ) break;
00176
00177
00178 j -= check_qual_in_win(j - TRASH_WINDOW_LEN + 1 - num_gaps_in_win, j + 1, seqNumber, curr_win_good);
00179
00180
00181
00182
00183 if ( curr_win_good ) {
00184 num_adj_win_good++;
00185
00186 if ( num_adj_win_good >= MIN_NUM_ADJ_WIN_GOOD ) {
00187
00188 eg_found = 1;
00189 }
00190 }
00191 else {
00192 if ( !eg_found ) {
00193 reset_good = 1;
00194 num_adj_win_good = 0;
00195 }
00196 }
00197
00198 if ( eg_found ) {
00199 break;
00200 }
00201 }
00202 }
00203
00204
00205
00206 int len = end_good - begin_good;
00207 if ( ( bg_found && eg_found ) && ( len >= MIN_LEN_GOOD ) ) {
00208
00209 getMAl()->set_beginGood(seqNumber, begin_good);
00210 getMAl()->set_endGood(seqNumber, end_good);
00211
00212 }
00213 else {
00214
00215 kill_read(seqNumber);
00216 }
00217
00218
00219
00220
00221 }
00222
00223
00224 int TrashAlgo::check_qual_in_win(int start, int stop, int ID, int &curr_win_good)
00225
00226
00227 {
00228
00229
00230
00231 assert( stop <= getMAl()->get_seq_end( ID) );
00232
00233 double sum_qual(0.0), sum_qual_square(0.0), average_qual(0.0);
00234 int minLogQual(0);
00235 int num_gap(0);
00236
00237
00238 for( int k = start; k < stop; k++ ) {
00239
00240
00241
00242
00243
00244
00245
00246
00247 if ( getMAl()->get_base( ID, k ) == '-' ) {
00248
00249 num_gap++;
00250 continue;
00251 }
00252
00253
00254 sum_qual += logQual_to_quality[ getMAl()->get_qual(ID, k) ];
00255 sum_qual_square += decode_qualval_square[ getMAl()->get_qual(ID, k) ];
00256 int tempQual = getMAl()->get_qual(ID, k);
00257 if(tempQual >= minLogQual)
00258 minLogQual = tempQual;
00259 }
00260
00261 average_qual = sum_qual / (double)TRASH_WINDOW_LEN;
00262
00263 if ( average_qual >= GOOD_PERCENTAGE )
00264 {
00265
00266
00267 double Q = sum_qual_square - sum_qual*sum_qual/(double)TRASH_WINDOW_LEN;
00268 assert(TRASH_WINDOW_LEN > 1);
00269 double variance = Q / (double)(TRASH_WINDOW_LEN - 1);
00270 if ( variance <= MAX_VARIANCE_IN_QUALWIN || minLogQual >= 27) {
00271
00272 curr_win_good = 1;
00273 }
00274 else {
00275
00276 curr_win_good = 0;
00277 }
00278
00279 }
00280 else
00281 {
00282
00283 curr_win_good = 0;
00284 }
00285
00286 if ( stop - start != TRASH_WINDOW_LEN + num_gap ) {
00287 cerr<<"stop - start: "<<stop - start<<endl;
00288 cerr<<"TRASH_WINDOW_LEN + num_gap: "<<TRASH_WINDOW_LEN + num_gap<<endl;
00289 cerr<<"stop: "<<stop<<endl;
00290 cerr<<"start: "<<start<<endl;
00291 cerr<<"TRASH_WINDOW_LEN: "<<TRASH_WINDOW_LEN<<endl;
00292 cerr<<"num_gap: "<<num_gap<<endl;
00293 }
00294
00295
00296 assert( stop - start == TRASH_WINDOW_LEN + num_gap );
00297 return num_gap;
00298 }
00299
00300
00301 void TrashAlgo::kill_read(int ID)
00302
00303
00304 {
00305
00306 getMAl()->set_beginGood( ID, getMAl()->get_len( ID )/2 );
00307 getMAl()->set_endGood( ID, getMAl()->get_len( ID )/2 );
00308 }
00309
00310