00001 #include <limits>
00002 #include <algorithm>
00003 #include "acewriteralgo.h"
00004 #include <fstream>
00005
00006 using namespace std;
00007
00008 void AceWriter::start()
00009 {
00010 write_ace_file();
00011 }
00012
00013
00014 void AceWriter::write_ace_file()
00015 {
00016
00017
00018 cerr<<"Writing ace file... ";
00019
00020 string dir = ".";
00021 string file_name = "trappercontig";
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 int tot_num_of_reads_in_ace_file;
00038
00039 string contig_name = "contig1";
00040 int num_of_bases;
00041 int num_of_reads_in_contig;
00042 int num_of_segments_in_contig;
00043 vector<char> consensus;
00044
00045 int num_of_unpadded_bases;
00046 vector<short> consensus_quality_values;
00047 consensus_quality_values.push_back(50);
00048
00049 vector<string> read_name;
00050 vector<char> UorC;
00051 vector<int> padded_start_consensus_position;
00052
00053 vector<int> BS_padded_start_consensus_position;
00054 vector<int> BS_padded_end_consensus_position;
00055 vector<string> BS_read_name;
00056
00057 vector<int> num_of_padded_bases;
00058 int num_of_whole_read_info_items = 0;
00059 vector<int> num_of_read_tags;
00060 vector< vector<char> > read_seq;
00061
00062 vector<int> qual_clipping_start;
00063 vector<int> qual_clipping_end;
00064 vector<int> align_clipping_start;
00065 vector<int> align_clipping_end;
00066
00067 vector<string> name_of_chromat_file;
00068 vector<string> name_of_phd_file;
00069 string date_of_phd_file = "Thu Mar 19 18:38:28 1998";
00070
00071 vector< vector<string> > tag_type;
00072 string what_program_created_tag = "TRAPPER";
00073 vector< vector<int> > padded_read_pos_start;
00074 vector< vector<int> > padded_read_pos_end;
00075 string date_when_tag_was_created = "980319:183828";
00076 vector< vector<string> > tag_data;
00077
00078
00079
00080
00081
00082
00083 cerr<<" (Setting parameter values.) ";
00084
00085 vector<char> tmp_charvec;
00086 vector<int> tmp_start, tmp_end;
00087 vector<string> tmp_data, tmp_type;
00088
00089
00090 vector<int> rowVec;
00091 int consensus_start = get_consensus(consensus, rowVec, num_of_unpadded_bases);
00092 num_of_bases = consensus.size();
00093
00094
00095 int id;
00096 int begin_pos;
00097 int len;
00098 int aligned_len;
00099 string name;
00100
00101 tot_num_of_reads_in_ace_file = 0;
00102
00103
00104
00105 int num_rows = getMAl()->get_num_seq ();
00106
00107
00108 for(int i=0; i < num_rows; i++) {
00109
00110
00111
00112 tmp_start.clear(); tmp_end.clear();
00113 tmp_data.clear(); tmp_type.clear();
00114
00115 tmp_charvec.clear();
00116
00117
00118
00119 id = i;
00120 name = getMAl()->get_name(id);
00121
00122
00123 if ( ! get_realignment_row(i, tmp_charvec, begin_pos) )
00124 cerr << "Id: " << id <<" have been removed from alignment!"<<endl;
00125 else {
00126 tot_num_of_reads_in_ace_file++;
00127
00128 UorC.push_back( getMAl()->get_strand(id)[0] );
00129
00130
00131
00132 read_name.push_back(name);
00133 name_of_phd_file.push_back( name + ".phd.1");
00134 name_of_chromat_file.push_back( name + ".ab1");
00135
00136 aligned_len = tmp_charvec.size();
00137
00138
00139 int offset=0;
00140
00141
00142 padded_start_consensus_position.push_back(begin_pos - consensus_start + 1 - offset);
00143
00144
00145 int paddedCount = 0;
00146 for( int j = 0; j < aligned_len; j++ ) {
00147
00148
00149 if ( tmp_charvec[j] == '*' ) paddedCount++;
00150
00151
00152
00153
00154 else if( getMAl()->is_DNP(id, j) ) {
00155 tmp_start.push_back( j + 1 + offset );
00156 tmp_end.push_back( j + 1 + offset );
00157 tmp_data.push_back("a DNP position");
00158 tmp_type.push_back("matchElsewhereHighQual");
00159 }
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176 }
00177
00178
00179
00180 vector<char> TheWholeSeq;
00181
00182
00183 for( int j = 0; j < aligned_len; j++ )
00184 TheWholeSeq.push_back(tmp_charvec[j]);
00185
00186
00187 len = TheWholeSeq.size();
00188
00189
00190
00191 num_of_read_tags.push_back(tmp_start.size());
00192
00193 padded_read_pos_start.push_back(tmp_start);
00194 padded_read_pos_end.push_back(tmp_end);
00195
00196 tag_type.push_back(tmp_type);
00197 tag_data.push_back(tmp_data);
00198
00199 read_seq.push_back(TheWholeSeq);
00200 num_of_padded_bases.push_back(len);
00201
00202 qual_clipping_start.push_back(1);
00203 qual_clipping_end.push_back(aligned_len);
00204 align_clipping_start.push_back(1);
00205 align_clipping_end.push_back(aligned_len);
00206 }
00207 }
00208
00209 num_of_reads_in_contig = tot_num_of_reads_in_ace_file;
00210
00211
00212
00213
00214 int curr = rowVec[0];
00215
00216 num_of_segments_in_contig = 1;
00217 BS_padded_start_consensus_position.push_back(1);
00218 string tmp = getMAl()->get_name(rowVec[0]);
00219 BS_read_name.push_back( tmp );
00220
00221 for( int n = 1; n < (int)rowVec.size(); n++ ) {
00222 if ( curr != rowVec[n] ) {
00223 BS_padded_end_consensus_position.push_back(n);
00224 num_of_segments_in_contig++;
00225 BS_padded_start_consensus_position.push_back(n+1);
00226 tmp = getMAl()->get_name(rowVec[n]);
00227 BS_read_name.push_back( tmp );
00228 curr = rowVec[n];
00229 }
00230 }
00231 BS_padded_end_consensus_position.push_back(rowVec.size());
00232
00233
00234
00235
00236
00237 if ( tot_num_of_reads_in_ace_file == 0 )
00238 return;
00239 else {
00240
00241 if(access(dir.data(), W_OK) == -1) { cerr << "Cannot write ace-files to " << dir << endl; exit(-1); }
00242
00243 ofstream outFile((dir + "/" + file_name + ".ace").data());
00244
00245 cerr<<" (Printing to file.) ";
00246
00247
00248 int num_contigs = 1;
00249 outFile<<"AS "<<num_contigs<<" "<<tot_num_of_reads_in_ace_file<<endl;
00250 outFile<<endl;
00251
00252
00253 outFile<<"CO "<<contig_name<<" "<<num_of_bases
00254 <<" "<<num_of_reads_in_contig<<" "<<num_of_segments_in_contig
00255 <<" U"<<endl;
00256 for( int i = 0; i < num_of_bases; i++ ) {
00257 if ( i != 0 && i % 50 == 0 )
00258 outFile<<endl;
00259 outFile<<consensus[i];
00260 }
00261 outFile<<endl;
00262 outFile<<endl;
00263
00264
00265 outFile<<"BQ "<<endl;
00266 for( int i = 0; i < num_of_unpadded_bases; i++ ) {
00267 if ( i != 0 && i % 50 == 0 )
00268 outFile<<endl;
00269 outFile<<' '<<consensus_quality_values[0];
00270 }
00271 outFile<<endl;
00272 outFile<<endl;
00273
00274
00275 for( int i = 0; i < num_of_reads_in_contig; i++ ) {
00276
00277 outFile<<"AF "<<read_name[i]<<" "<<UorC[i]
00278 <<" "<<padded_start_consensus_position[i]<<endl;
00279 }
00280
00281
00282 for( int i = 0; i < num_of_segments_in_contig; i++ ) {
00283
00284 outFile<<"BS "<<BS_padded_start_consensus_position[i]
00285 <<" "<<BS_padded_end_consensus_position[i]
00286 <<" "<<BS_read_name[i]<<endl;
00287 }
00288 outFile<<endl;
00289
00290
00291 for( int i = 0; i < num_of_reads_in_contig; i++ ) {
00292
00293
00294
00295 outFile<<"RD "<<read_name[i]<<" "<<num_of_padded_bases[i]
00296 <<" "<<num_of_whole_read_info_items
00297 <<" "<<num_of_read_tags[i]<<endl;
00298 for( int j = 0; j < num_of_padded_bases[i]; j++ ) {
00299 if ( j != 0 && j % 50 == 0 )
00300 outFile<<endl;
00301 outFile<<read_seq[i][j];
00302 }
00303 outFile<<endl;
00304 outFile<<endl;
00305
00306
00307 outFile<<"QA "<<qual_clipping_start[i]
00308 <<" "<<qual_clipping_end[i]
00309 <<" "<<align_clipping_start[i]
00310 <<" "<<align_clipping_end[i]<<endl;
00311 outFile<<endl;
00312
00313
00314 outFile<<"DS CHROMAT_FILE: "<<name_of_chromat_file[i]
00315 <<" PHD_FILE: "<<name_of_phd_file[i]
00316 <<" TIME: "<<date_of_phd_file<<endl;
00317 outFile<<endl;
00318
00319
00320 for( int j = 0; j < (int)padded_read_pos_start[i].size(); j++ ) {
00321 outFile<<"RT{"<<endl;
00322 outFile<<read_name[i]<<' '<<tag_type[i][j]<<' '
00323 <<what_program_created_tag<<' '
00324 <<padded_read_pos_start[i][j]<<' '
00325 <<padded_read_pos_end[i][j]<<' '
00326 <<date_when_tag_was_created<<endl;
00327 outFile<<tag_data[i][j]<<endl;
00328 outFile<<'}'<<endl;
00329 outFile<<endl;
00330 }
00331 }
00332
00333 cerr<<" Done. "<<endl;
00334 outFile.close();
00335 }
00336
00337 }
00338
00339
00340 int AceWriter::get_consensus(vector <char> &cons, vector<int> &rowVec, int &num) {
00341
00342 cons.clear();
00343 rowVec.clear();
00344 num = 0;
00345
00346 int min_row_value = numeric_limits<int>::max();
00347 int num_cols = 0;
00348
00349 for(int i=0; i < getMAl()->get_num_seq(); i++){
00350 if( num_cols < getMAl()->get_seq_end_global(i))
00351 num_cols = getMAl()->get_seq_end_global(i);
00352 }
00353
00354 for( int c = 0; c < num_cols; c++ ) {
00355 short Acount(0), Tcount(0), Gcount(0), Ccount(0), GAPcount(0);
00356 int Arow(1), Trow(1), Grow(1), Crow(1), GAProw(1);
00357
00358 for( int r = 0; r < getMAl()->get_num_seq(); r++ ) {
00359
00360 if ( min_row_value > getMAl()->get_seq_begin_global(r) )
00361 min_row_value = getMAl()->get_seq_begin_global(r);
00362
00363 if( c >= getMAl()->get_seq_begin_global(r) && c < getMAl()->get_seq_end_global(r) ){
00364 char base = getMAl()->get_base_global(r, c);
00365
00366 if( base == 'a' ) { Acount++; if ( Acount == 1 ) Arow = r; }
00367 else if( base == 't' ){ Tcount++; if ( Tcount == 1 ) Trow = r; }
00368 else if( base == 'g' ){ Gcount++; if ( Gcount == 1 ) Grow = r; }
00369 else if( base == 'c' ){ Ccount++; if ( Ccount == 1 ) Crow = r; }
00370 else if( base == '*' || base == 'n'){ GAPcount++; if ( GAPcount == 1 ) GAProw = r; }
00371
00372 }
00373 }
00374 if( Acount != 0 && Acount >= Tcount && Acount >= Gcount && Acount >= Ccount && Acount >= GAPcount )
00375 { cons.push_back( 'a' ); rowVec.push_back( Arow ); }
00376 else if( Tcount != 0 && Tcount >= Gcount && Tcount >= Ccount && Tcount >= GAPcount )
00377 { cons.push_back( 't' ); rowVec.push_back( Trow ); }
00378 else if( Gcount != 0 && Gcount >= Ccount && Gcount >= GAPcount )
00379 { cons.push_back( 'g' ); rowVec.push_back( Grow ); }
00380 else if( Ccount != 0 && Ccount >= GAPcount )
00381 { cons.push_back( 'c' ); rowVec.push_back( Crow ); }
00382 else if( GAPcount != 0 ) {
00383 num++;
00384 cons.push_back( '*' );
00385 rowVec.push_back( GAProw );
00386 }
00387 }
00388 assert( cons.size() == rowVec.size());
00389 num = cons.size() - num;
00390 return min_row_value;
00391
00392 }
00393
00394 int AceWriter::get_realignment_row(const int row, vector<char> &aligned_read, int &row_start) {
00395
00396 string tmp = getMAl()->get_seq(row);
00397 copy(tmp.begin(), tmp.end(), back_inserter(aligned_read));
00398 row_start = getMAl()->get_seq_begin_global(row);
00399 return getMAl()->get_seq_end_global(row);
00400 }
00401