Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

acewriteralgo.cc

Go to the documentation of this file.
00001 #include <limits>
00002 #include <algorithm>
00003 #include "acewriteralgo.h"
00004 #include <fstream>
00005 
00006 using namespace std;
00007 
00008 void AceWriter::start()
00009 {
00010   write_ace_file();
00011 }
00012 
00013 
00014 void AceWriter::write_ace_file() 
00015 { 
00016   //This is adapted from old code, needs a rewrite!
00017   
00018   cerr<<"Writing ace file... ";
00019 
00020   string dir = ".";
00021   string file_name = "trappercontig";
00022 
00023 
00024   // The parameters name are taken from the documentation of the 
00025   // ace file format doc at; 
00026   // http://www.phrap.org/consed/distributions/README.13.0.txt
00027 
00028   // There's a lot of variables in this function. It's possible to
00029   // write this function with less variables, but thay are there
00030   // to make it easier to rewrite the function if you want to add stuff.
00031 
00032   // ----------------------------------------------------------------------
00033   // ---------- Declerations of parameters with default values!
00034   // ----------------------------------------------------------------------
00035 
00036   // AS parameters
00037   int tot_num_of_reads_in_ace_file;
00038   // CO parameters
00039   string contig_name = "contig1";
00040   int num_of_bases;
00041   int num_of_reads_in_contig;
00042   int num_of_segments_in_contig;
00043   vector<char> consensus; 
00044   // BQ parameters
00045   int num_of_unpadded_bases;
00046   vector<short> consensus_quality_values;
00047   consensus_quality_values.push_back(50); // The consensus will get 50
00048   // AF parameters
00049   vector<string> read_name;
00050   vector<char> UorC;
00051   vector<int> padded_start_consensus_position;
00052   // BS parameters
00053   vector<int> BS_padded_start_consensus_position;
00054   vector<int> BS_padded_end_consensus_position;
00055   vector<string> BS_read_name;
00056   // RD parameters
00057   vector<int> num_of_padded_bases; 
00058   int num_of_whole_read_info_items = 0;  // puch dakota!
00059   vector<int> num_of_read_tags;            
00060   vector< vector<char> > read_seq;
00061   // QA parameters
00062   vector<int> qual_clipping_start;
00063   vector<int> qual_clipping_end;
00064   vector<int> align_clipping_start;
00065   vector<int> align_clipping_end;
00066   // DS parameters
00067   vector<string> name_of_chromat_file;
00068   vector<string> name_of_phd_file;
00069   string date_of_phd_file = "Thu Mar 19 18:38:28 1998"; // Northwest Energy Review Transition Board - Mar 19, 1998 report
00070   // RT parameters
00071   vector< vector<string> > tag_type;
00072   string what_program_created_tag = "TRAPPER";
00073   vector< vector<int> > padded_read_pos_start;
00074   vector< vector<int> > padded_read_pos_end;
00075   string date_when_tag_was_created = "980319:183828";
00076   vector< vector<string> > tag_data;
00077   // CA & WA are at the moment not in this function!
00078 
00079 
00080   // ----------------------------------------------------------------------
00081   // ---------- Set parameter values.
00082   // ----------------------------------------------------------------------
00083   cerr<<" (Setting parameter values.) ";  
00084   
00085   vector<char> tmp_charvec;
00086   vector<int> tmp_start, tmp_end;
00087   vector<string> tmp_data, tmp_type;
00088   
00089   // ################################# Get consensus #################################
00090   vector<int> rowVec;
00091   int consensus_start = get_consensus(consensus, rowVec, num_of_unpadded_bases);
00092   num_of_bases = consensus.size();
00093   
00094   // ################################# Get info from the cosmid #################################
00095   int id;
00096   int begin_pos;
00097   int len;
00098   int aligned_len;
00099   string name;
00100   //bool OK;
00101   tot_num_of_reads_in_ace_file = 0;
00102 
00103   // Mod to output query sequence in ace file.. DN 040407 
00104 
00105   int num_rows = getMAl()->get_num_seq ();
00106   
00107 
00108   for(int i=0; i < num_rows; i++) { // Loop over all aligned reads + query
00109 
00110     // Make room
00111     // for RT
00112     tmp_start.clear(); tmp_end.clear();
00113     tmp_data.clear(); tmp_type.clear();
00114     // and seq
00115     tmp_charvec.clear();
00116 
00117       
00118     // Get the name
00119     id = i;
00120     name = getMAl()->get_name(id); 
00121     
00122     // ################################# Get seq from real_matrix
00123     if ( ! get_realignment_row(i, tmp_charvec, begin_pos) ) 
00124       cerr << "Id: " << id <<" have been removed from alignment!"<<endl;
00125     else {
00126       tot_num_of_reads_in_ace_file++; // One more not empty read;
00127 
00128       UorC.push_back( getMAl()->get_strand(id)[0] );
00129       
00130       
00131       // Store names
00132       read_name.push_back(name);
00133       name_of_phd_file.push_back( name + ".phd.1");
00134       name_of_chromat_file.push_back( name + ".ab1");
00135       
00136       aligned_len = tmp_charvec.size();
00137       
00138       //int offset = current_cos->get_beginGood(id) - current_cos->get_seq_begin(id);
00139       int offset=0;//No offset?
00140       
00141       // Set start of the read aginst the consensus!
00142       padded_start_consensus_position.push_back(begin_pos - consensus_start + 1 - offset);
00143       
00144       // ################################# Set RT:s 
00145       int paddedCount = 0;
00146       for( int j = 0; j < aligned_len; j++ ) {
00147 
00148         // Count gaps
00149         if ( tmp_charvec[j] == '*'  ) paddedCount++;
00150         
00151         // DNP position
00152 
00153 
00154         else if( getMAl()->is_DNP(id, j) ) {
00155           tmp_start.push_back( j + 1 + offset );
00156           tmp_end.push_back( j + 1 + offset );
00157           tmp_data.push_back("a DNP position");
00158           tmp_type.push_back("matchElsewhereHighQual");
00159         }
00160         
00161         // Marked region!
00162 //         else if ( tmp_charvec[j] == 'x' ) {
00163 //           tmp_start.push_back( j + 1 + offset );
00164 //           tmp_data.push_back("Marked region");
00165 //           tmp_type.push_back("significantDiscrepancy");
00166           
00167 //           // Go through the whole marked region
00168 //           while((tmp_charvec[j] == '*' || tmp_charvec[j] == 'x') && j < aligned_len  ) {
00169 //             if ( tmp_charvec[j] == '*') paddedCount++;
00170 //             else tmp_charvec[j] = current_cos->get_base(id, current_cos->get_beginGood(id) + j - paddedCount);
00171 //             j++;
00172 //           }
00173 //           tmp_end.push_back(j + offset);
00174 //         }
00175 
00176       }
00177 
00178       // ################################# Adds the bases that was not aligned
00179 
00180       vector<char> TheWholeSeq; // = head + tmp_charvec + tail
00181       // Head
00182 
00183       for( int j = 0; j < aligned_len; j++ ) 
00184         TheWholeSeq.push_back(tmp_charvec[j]);
00185       // Tail
00186       
00187       len = TheWholeSeq.size();
00188       
00189       
00190       // Save everything!
00191       num_of_read_tags.push_back(tmp_start.size());
00192       
00193       padded_read_pos_start.push_back(tmp_start);
00194       padded_read_pos_end.push_back(tmp_end);
00195       
00196       tag_type.push_back(tmp_type);
00197       tag_data.push_back(tmp_data);
00198       
00199       read_seq.push_back(TheWholeSeq);
00200       num_of_padded_bases.push_back(len);
00201       
00202       qual_clipping_start.push_back(1);
00203       qual_clipping_end.push_back(aligned_len);
00204       align_clipping_start.push_back(1);
00205       align_clipping_end.push_back(aligned_len);
00206     }
00207   }
00208   
00209   num_of_reads_in_contig = tot_num_of_reads_in_ace_file; // One contig in acefile
00210   
00211   // ################################# Set BS values! #################################
00212   // It will give bs that are sorted, nonoverlapping and match the consensus!
00213 
00214   int curr = rowVec[0];
00215   
00216   num_of_segments_in_contig = 1;
00217   BS_padded_start_consensus_position.push_back(1);
00218   string tmp = getMAl()->get_name(rowVec[0]);
00219   BS_read_name.push_back( tmp );
00220   
00221   for( int n = 1; n < (int)rowVec.size(); n++ ) {
00222     if ( curr != rowVec[n] ) {
00223       BS_padded_end_consensus_position.push_back(n);
00224       num_of_segments_in_contig++;
00225       BS_padded_start_consensus_position.push_back(n+1);
00226       tmp = getMAl()->get_name(rowVec[n]);
00227       BS_read_name.push_back( tmp );
00228       curr = rowVec[n];
00229     }
00230   }
00231   BS_padded_end_consensus_position.push_back(rowVec.size());
00232   
00233   
00234   // ----------------------------------------------------------------------
00235   // ---------- Write ace file.
00236   // ----------------------------------------------------------------------
00237   if ( tot_num_of_reads_in_ace_file == 0 )
00238     return;
00239   else {
00240     // Disk full?
00241     if(access(dir.data(), W_OK) == -1) { cerr << "Cannot write ace-files to " << dir << endl; exit(-1); }
00242     
00243     ofstream outFile((dir + "/" + file_name + ".ace").data());
00244     
00245     cerr<<" (Printing to file.) ";
00246     
00247     // write AS
00248     int num_contigs = 1;
00249     outFile<<"AS "<<num_contigs<<" "<<tot_num_of_reads_in_ace_file<<endl;
00250     outFile<<endl;
00251     
00252     // write CO
00253     outFile<<"CO "<<contig_name<<" "<<num_of_bases
00254            <<" "<<num_of_reads_in_contig<<" "<<num_of_segments_in_contig
00255            <<" U"<<endl;
00256     for( int i = 0; i < num_of_bases; i++ ) {
00257       if ( i != 0 && i % 50 == 0 )
00258         outFile<<endl;
00259       outFile<<consensus[i];
00260     }
00261     outFile<<endl;
00262     outFile<<endl;
00263     
00264     // write BQ
00265     outFile<<"BQ "<<endl;
00266     for( int i = 0; i < num_of_unpadded_bases; i++ ) {
00267       if ( i != 0 && i % 50 == 0 )
00268         outFile<<endl;
00269       outFile<<' '<<consensus_quality_values[0];
00270     }
00271     outFile<<endl;
00272     outFile<<endl;
00273   
00274     // write AF
00275     for( int i = 0; i < num_of_reads_in_contig; i++ ) {
00276       // Change parameters value!
00277       outFile<<"AF "<<read_name[i]<<" "<<UorC[i]
00278              <<" "<<padded_start_consensus_position[i]<<endl;
00279     }
00280     
00281     // write BS
00282     for( int i = 0; i < num_of_segments_in_contig; i++ ) {
00283       // Change parameters value!
00284       outFile<<"BS "<<BS_padded_start_consensus_position[i]
00285              <<" "<<BS_padded_end_consensus_position[i]
00286              <<" "<<BS_read_name[i]<<endl;
00287     }
00288     outFile<<endl;
00289     
00290     // write RD, QA & DS
00291     for( int i = 0; i < num_of_reads_in_contig; i++ ) {
00292       // Change parameters value!
00293       
00294       // write RD
00295       outFile<<"RD "<<read_name[i]<<" "<<num_of_padded_bases[i]
00296              <<" "<<num_of_whole_read_info_items
00297              <<" "<<num_of_read_tags[i]<<endl;
00298       for( int j = 0; j < num_of_padded_bases[i]; j++ ) {
00299         if ( j != 0 && j % 50 == 0 )
00300           outFile<<endl;
00301         outFile<<read_seq[i][j];
00302       }
00303       outFile<<endl;
00304       outFile<<endl;
00305       
00306       // write QA
00307       outFile<<"QA "<<qual_clipping_start[i]
00308              <<" "<<qual_clipping_end[i]
00309              <<" "<<align_clipping_start[i]
00310              <<" "<<align_clipping_end[i]<<endl;
00311       outFile<<endl;
00312       
00313       // write DS
00314       outFile<<"DS CHROMAT_FILE: "<<name_of_chromat_file[i]
00315              <<" PHD_FILE: "<<name_of_phd_file[i]
00316              <<" TIME: "<<date_of_phd_file<<endl;
00317       outFile<<endl;
00318       
00319       // write RT
00320       for( int j = 0; j < (int)padded_read_pos_start[i].size(); j++ ) {
00321         outFile<<"RT{"<<endl;
00322         outFile<<read_name[i]<<' '<<tag_type[i][j]<<' '
00323                <<what_program_created_tag<<' '
00324                <<padded_read_pos_start[i][j]<<' '
00325                <<padded_read_pos_end[i][j]<<' '
00326                <<date_when_tag_was_created<<endl;
00327         outFile<<tag_data[i][j]<<endl;
00328         outFile<<'}'<<endl;
00329         outFile<<endl;
00330       }
00331     }
00332 
00333     cerr<<" Done. "<<endl;
00334     outFile.close();
00335   }
00336   
00337 }
00338 
00339 
00340 int AceWriter::get_consensus(vector <char> &cons, vector<int> &rowVec, int &num) {
00341 
00342   cons.clear();
00343   rowVec.clear();
00344   num = 0;
00345   
00346   int min_row_value = numeric_limits<int>::max();
00347   int num_cols = 0;
00348   
00349   for(int i=0; i < getMAl()->get_num_seq(); i++){
00350     if( num_cols < getMAl()->get_seq_end_global(i))
00351       num_cols = getMAl()->get_seq_end_global(i);
00352   }
00353   
00354   for( int c = 0; c < num_cols; c++ ) {   
00355     short Acount(0), Tcount(0), Gcount(0), Ccount(0), GAPcount(0);
00356     int Arow(1), Trow(1), Grow(1), Crow(1), GAProw(1);
00357     
00358     for( int r = 0; r < getMAl()->get_num_seq(); r++ ) {
00359       
00360       if ( min_row_value > getMAl()->get_seq_begin_global(r) )
00361         min_row_value = getMAl()->get_seq_begin_global(r);
00362       
00363       if( c >= getMAl()->get_seq_begin_global(r) && c < getMAl()->get_seq_end_global(r)  ){
00364         char base = getMAl()->get_base_global(r, c);
00365         
00366         if( base == 'a' )     { Acount++; if ( Acount == 1 ) Arow = r; }
00367         else if( base == 't' ){ Tcount++; if ( Tcount == 1 ) Trow = r; }
00368         else if( base == 'g' ){ Gcount++; if ( Gcount == 1 ) Grow = r; }
00369         else if( base == 'c' ){ Ccount++; if ( Ccount == 1 ) Crow = r; }
00370         else if( base == '*' || base == 'n'){ GAPcount++; if ( GAPcount == 1 ) GAProw = r; }
00371         
00372       }
00373     }
00374     if( Acount != 0 && Acount >= Tcount && Acount >= Gcount && Acount >= Ccount && Acount >= GAPcount )
00375       { cons.push_back( 'a' ); rowVec.push_back( Arow ); }//
00376     else if( Tcount != 0 && Tcount >= Gcount && Tcount >= Ccount && Tcount >= GAPcount )
00377       { cons.push_back( 't' ); rowVec.push_back( Trow ); }//
00378     else if( Gcount != 0 && Gcount >= Ccount && Gcount >= GAPcount )
00379       { cons.push_back( 'g' ); rowVec.push_back( Grow ); }//
00380     else if( Ccount != 0 && Ccount >= GAPcount )
00381       { cons.push_back( 'c' ); rowVec.push_back( Crow ); }//
00382     else if( GAPcount != 0 ) {
00383       num++;
00384       cons.push_back( '*' ); 
00385       rowVec.push_back( GAProw );
00386     }
00387   }
00388   assert( cons.size() == rowVec.size());
00389   num = cons.size() - num; // Number of unpadded bases!
00390   return min_row_value;
00391   
00392 }
00393 
00394 int AceWriter::get_realignment_row(const int row, vector<char> &aligned_read, int &row_start) {
00395 
00396   string tmp = getMAl()->get_seq(row);
00397   copy(tmp.begin(), tmp.end(), back_inserter(aligned_read));
00398   row_start = getMAl()->get_seq_begin_global(row);
00399   return getMAl()->get_seq_end_global(row);
00400 }
00401 

Generated on Fri Mar 17 17:44:24 2006 for trapper by  doxygen 1.4.4