#include <acewriteralgo.h>
Inheritance diagram for AceWriter:
Public Member Functions | |
AceWriter (TrapperDoc *pDoc_, std::set< db_recno_t > &recnoList, AlgoParam *param) | |
void | start () |
Private Member Functions | |
void | write_ace_file () |
int | get_consensus (std::vector< char > &cons, std::vector< int > &rowVec, int &num) |
int | get_realignment_row (const int row, std::vector< char > &aligned_read, int &row_start) |
Definition at line 4 of file acewriteralgo.h.
|
Definition at line 7 of file acewriteralgo.h. 00007 : 00008 RAlgo(pDoc_, recnoList, param) {}
|
|
Referenced by write_ace_file(). |
|
Referenced by write_ace_file(). |
|
Implements RAlgo. Definition at line 8 of file acewriteralgo.cc. References write_ace_file(). 00009 { 00010 write_ace_file(); 00011 }
|
|
Definition at line 14 of file acewriteralgo.cc. References get_consensus(), MAl_Readonly::get_name(), MAl_Readonly::get_num_seq(), get_realignment_row(), RAlgo::getMAl(), and MAl_Readonly::is_DNP(). Referenced by start(). 00015 { 00016 //This is adapted from old code, needs a rewrite! 00017 00018 cerr<<"Writing ace file... "; 00019 00020 string dir = "."; 00021 string file_name = "trappercontig"; 00022 00023 00024 // The parameters name are taken from the documentation of the 00025 // ace file format doc at; 00026 // http://www.phrap.org/consed/distributions/README.13.0.txt 00027 00028 // There's a lot of variables in this function. It's possible to 00029 // write this function with less variables, but thay are there 00030 // to make it easier to rewrite the function if you want to add stuff. 00031 00032 // ---------------------------------------------------------------------- 00033 // ---------- Declerations of parameters with default values! 00034 // ---------------------------------------------------------------------- 00035 00036 // AS parameters 00037 int tot_num_of_reads_in_ace_file; 00038 // CO parameters 00039 string contig_name = "contig1"; 00040 int num_of_bases; 00041 int num_of_reads_in_contig; 00042 int num_of_segments_in_contig; 00043 vector<char> consensus; 00044 // BQ parameters 00045 int num_of_unpadded_bases; 00046 vector<short> consensus_quality_values; 00047 consensus_quality_values.push_back(50); // The consensus will get 50 00048 // AF parameters 00049 vector<string> read_name; 00050 vector<char> UorC; 00051 vector<int> padded_start_consensus_position; 00052 // BS parameters 00053 vector<int> BS_padded_start_consensus_position; 00054 vector<int> BS_padded_end_consensus_position; 00055 vector<string> BS_read_name; 00056 // RD parameters 00057 vector<int> num_of_padded_bases; 00058 int num_of_whole_read_info_items = 0; // puch dakota! 00059 vector<int> num_of_read_tags; 00060 vector< vector<char> > read_seq; 00061 // QA parameters 00062 vector<int> qual_clipping_start; 00063 vector<int> qual_clipping_end; 00064 vector<int> align_clipping_start; 00065 vector<int> align_clipping_end; 00066 // DS parameters 00067 vector<string> name_of_chromat_file; 00068 vector<string> name_of_phd_file; 00069 string date_of_phd_file = "Thu Mar 19 18:38:28 1998"; // Northwest Energy Review Transition Board - Mar 19, 1998 report 00070 // RT parameters 00071 vector< vector<string> > tag_type; 00072 string what_program_created_tag = "TRAPPER"; 00073 vector< vector<int> > padded_read_pos_start; 00074 vector< vector<int> > padded_read_pos_end; 00075 string date_when_tag_was_created = "980319:183828"; 00076 vector< vector<string> > tag_data; 00077 // CA & WA are at the moment not in this function! 00078 00079 00080 // ---------------------------------------------------------------------- 00081 // ---------- Set parameter values. 00082 // ---------------------------------------------------------------------- 00083 cerr<<" (Setting parameter values.) "; 00084 00085 vector<char> tmp_charvec; 00086 vector<int> tmp_start, tmp_end; 00087 vector<string> tmp_data, tmp_type; 00088 00089 // ################################# Get consensus ################################# 00090 vector<int> rowVec; 00091 int consensus_start = get_consensus(consensus, rowVec, num_of_unpadded_bases); 00092 num_of_bases = consensus.size(); 00093 00094 // ################################# Get info from the cosmid ################################# 00095 int id; 00096 int begin_pos; 00097 int len; 00098 int aligned_len; 00099 string name; 00100 //bool OK; 00101 tot_num_of_reads_in_ace_file = 0; 00102 00103 // Mod to output query sequence in ace file.. DN 040407 00104 00105 int num_rows = getMAl()->get_num_seq (); 00106 00107 00108 for(int i=0; i < num_rows; i++) { // Loop over all aligned reads + query 00109 00110 // Make room 00111 // for RT 00112 tmp_start.clear(); tmp_end.clear(); 00113 tmp_data.clear(); tmp_type.clear(); 00114 // and seq 00115 tmp_charvec.clear(); 00116 00117 00118 // Get the name 00119 id = i; 00120 name = getMAl()->get_name(id); 00121 00122 // ################################# Get seq from real_matrix 00123 if ( ! get_realignment_row(i, tmp_charvec, begin_pos) ) 00124 cerr << "Id: " << id <<" have been removed from alignment!"<<endl; 00125 else { 00126 tot_num_of_reads_in_ace_file++; // One more not empty read; 00127 00128 UorC.push_back( getMAl()->get_strand(id)[0] ); 00129 00130 00131 // Store names 00132 read_name.push_back(name); 00133 name_of_phd_file.push_back( name + ".phd.1"); 00134 name_of_chromat_file.push_back( name + ".ab1"); 00135 00136 aligned_len = tmp_charvec.size(); 00137 00138 //int offset = current_cos->get_beginGood(id) - current_cos->get_seq_begin(id); 00139 int offset=0;//No offset? 00140 00141 // Set start of the read aginst the consensus! 00142 padded_start_consensus_position.push_back(begin_pos - consensus_start + 1 - offset); 00143 00144 // ################################# Set RT:s 00145 int paddedCount = 0; 00146 for( int j = 0; j < aligned_len; j++ ) { 00147 00148 // Count gaps 00149 if ( tmp_charvec[j] == '*' ) paddedCount++; 00150 00151 // DNP position 00152 00153 00154 else if( getMAl()->is_DNP(id, j) ) { 00155 tmp_start.push_back( j + 1 + offset ); 00156 tmp_end.push_back( j + 1 + offset ); 00157 tmp_data.push_back("a DNP position"); 00158 tmp_type.push_back("matchElsewhereHighQual"); 00159 } 00160 00161 // Marked region! 00162 // else if ( tmp_charvec[j] == 'x' ) { 00163 // tmp_start.push_back( j + 1 + offset ); 00164 // tmp_data.push_back("Marked region"); 00165 // tmp_type.push_back("significantDiscrepancy"); 00166 00167 // // Go through the whole marked region 00168 // while((tmp_charvec[j] == '*' || tmp_charvec[j] == 'x') && j < aligned_len ) { 00169 // if ( tmp_charvec[j] == '*') paddedCount++; 00170 // else tmp_charvec[j] = current_cos->get_base(id, current_cos->get_beginGood(id) + j - paddedCount); 00171 // j++; 00172 // } 00173 // tmp_end.push_back(j + offset); 00174 // } 00175 00176 } 00177 00178 // ################################# Adds the bases that was not aligned 00179 00180 vector<char> TheWholeSeq; // = head + tmp_charvec + tail 00181 // Head 00182 00183 for( int j = 0; j < aligned_len; j++ ) 00184 TheWholeSeq.push_back(tmp_charvec[j]); 00185 // Tail 00186 00187 len = TheWholeSeq.size(); 00188 00189 00190 // Save everything! 00191 num_of_read_tags.push_back(tmp_start.size()); 00192 00193 padded_read_pos_start.push_back(tmp_start); 00194 padded_read_pos_end.push_back(tmp_end); 00195 00196 tag_type.push_back(tmp_type); 00197 tag_data.push_back(tmp_data); 00198 00199 read_seq.push_back(TheWholeSeq); 00200 num_of_padded_bases.push_back(len); 00201 00202 qual_clipping_start.push_back(1); 00203 qual_clipping_end.push_back(aligned_len); 00204 align_clipping_start.push_back(1); 00205 align_clipping_end.push_back(aligned_len); 00206 } 00207 } 00208 00209 num_of_reads_in_contig = tot_num_of_reads_in_ace_file; // One contig in acefile 00210 00211 // ################################# Set BS values! ################################# 00212 // It will give bs that are sorted, nonoverlapping and match the consensus! 00213 00214 int curr = rowVec[0]; 00215 00216 num_of_segments_in_contig = 1; 00217 BS_padded_start_consensus_position.push_back(1); 00218 string tmp = getMAl()->get_name(rowVec[0]); 00219 BS_read_name.push_back( tmp ); 00220 00221 for( int n = 1; n < (int)rowVec.size(); n++ ) { 00222 if ( curr != rowVec[n] ) { 00223 BS_padded_end_consensus_position.push_back(n); 00224 num_of_segments_in_contig++; 00225 BS_padded_start_consensus_position.push_back(n+1); 00226 tmp = getMAl()->get_name(rowVec[n]); 00227 BS_read_name.push_back( tmp ); 00228 curr = rowVec[n]; 00229 } 00230 } 00231 BS_padded_end_consensus_position.push_back(rowVec.size()); 00232 00233 00234 // ---------------------------------------------------------------------- 00235 // ---------- Write ace file. 00236 // ---------------------------------------------------------------------- 00237 if ( tot_num_of_reads_in_ace_file == 0 ) 00238 return; 00239 else { 00240 // Disk full? 00241 if(access(dir.data(), W_OK) == -1) { cerr << "Cannot write ace-files to " << dir << endl; exit(-1); } 00242 00243 ofstream outFile((dir + "/" + file_name + ".ace").data()); 00244 00245 cerr<<" (Printing to file.) "; 00246 00247 // write AS 00248 int num_contigs = 1; 00249 outFile<<"AS "<<num_contigs<<" "<<tot_num_of_reads_in_ace_file<<endl; 00250 outFile<<endl; 00251 00252 // write CO 00253 outFile<<"CO "<<contig_name<<" "<<num_of_bases 00254 <<" "<<num_of_reads_in_contig<<" "<<num_of_segments_in_contig 00255 <<" U"<<endl; 00256 for( int i = 0; i < num_of_bases; i++ ) { 00257 if ( i != 0 && i % 50 == 0 ) 00258 outFile<<endl; 00259 outFile<<consensus[i]; 00260 } 00261 outFile<<endl; 00262 outFile<<endl; 00263 00264 // write BQ 00265 outFile<<"BQ "<<endl; 00266 for( int i = 0; i < num_of_unpadded_bases; i++ ) { 00267 if ( i != 0 && i % 50 == 0 ) 00268 outFile<<endl; 00269 outFile<<' '<<consensus_quality_values[0]; 00270 } 00271 outFile<<endl; 00272 outFile<<endl; 00273 00274 // write AF 00275 for( int i = 0; i < num_of_reads_in_contig; i++ ) { 00276 // Change parameters value! 00277 outFile<<"AF "<<read_name[i]<<" "<<UorC[i] 00278 <<" "<<padded_start_consensus_position[i]<<endl; 00279 } 00280 00281 // write BS 00282 for( int i = 0; i < num_of_segments_in_contig; i++ ) { 00283 // Change parameters value! 00284 outFile<<"BS "<<BS_padded_start_consensus_position[i] 00285 <<" "<<BS_padded_end_consensus_position[i] 00286 <<" "<<BS_read_name[i]<<endl; 00287 } 00288 outFile<<endl; 00289 00290 // write RD, QA & DS 00291 for( int i = 0; i < num_of_reads_in_contig; i++ ) { 00292 // Change parameters value! 00293 00294 // write RD 00295 outFile<<"RD "<<read_name[i]<<" "<<num_of_padded_bases[i] 00296 <<" "<<num_of_whole_read_info_items 00297 <<" "<<num_of_read_tags[i]<<endl; 00298 for( int j = 0; j < num_of_padded_bases[i]; j++ ) { 00299 if ( j != 0 && j % 50 == 0 ) 00300 outFile<<endl; 00301 outFile<<read_seq[i][j]; 00302 } 00303 outFile<<endl; 00304 outFile<<endl; 00305 00306 // write QA 00307 outFile<<"QA "<<qual_clipping_start[i] 00308 <<" "<<qual_clipping_end[i] 00309 <<" "<<align_clipping_start[i] 00310 <<" "<<align_clipping_end[i]<<endl; 00311 outFile<<endl; 00312 00313 // write DS 00314 outFile<<"DS CHROMAT_FILE: "<<name_of_chromat_file[i] 00315 <<" PHD_FILE: "<<name_of_phd_file[i] 00316 <<" TIME: "<<date_of_phd_file<<endl; 00317 outFile<<endl; 00318 00319 // write RT 00320 for( int j = 0; j < (int)padded_read_pos_start[i].size(); j++ ) { 00321 outFile<<"RT{"<<endl; 00322 outFile<<read_name[i]<<' '<<tag_type[i][j]<<' ' 00323 <<what_program_created_tag<<' ' 00324 <<padded_read_pos_start[i][j]<<' ' 00325 <<padded_read_pos_end[i][j]<<' ' 00326 <<date_when_tag_was_created<<endl; 00327 outFile<<tag_data[i][j]<<endl; 00328 outFile<<'}'<<endl; 00329 outFile<<endl; 00330 } 00331 } 00332 00333 cerr<<" Done. "<<endl; 00334 outFile.close(); 00335 } 00336 00337 }
|