00001 00002 #ifndef MAL_READONLY_H_EA 00003 #define MAL_READONLY_H_EA 00004 00005 /** @author Erik Arner, Karolinska Institute, (c) Erik Arner 2003. 00006 @version MAl version 0.1 00007 */ 00008 00009 /** \brief MAl stands for Multiple Alignment, it is a container class 00010 for sequences in such an alignment. 00011 00012 The idea is that this class should have "two" interfaces, one 00013 alignment/matrix interface with global indexes, columns etc and 00014 one sequence collection interface similar to the old cosmid 00015 interface. 00016 00017 First version is a front end to the Berkeley DB. This could be 00018 made more flexible in future versions, allowing any storage of 00019 data using pImpl idiom. 00020 00021 Problem: BDB has 1-based indexing (recno), while every algo 00022 written in the TRAP system uses 0-based indexing. I'll stick to 00023 0-based indexing in this system, to avoid future hassle. 00024 00025 Future plans: make this system plugable along the same lines as 00026 the data classes in trapper, as the first implementation of the 00027 MAl class will be based on these classes. Also remove the coupling 00028 between this class and TrapperDoc... 00029 */ 00030 00031 #include <set> 00032 #include <vector> 00033 #include "trapperdoc.h"//maybe fwd-decl instead?? 00034 #include "db_cxx.h" 00035 00036 00037 //Public, global typedefs 00038 typedef char base_t; 00039 // typedef short int qual_t; 00040 typedef Q_UINT32 qual_t; 00041 00042 class MAl_Readonly 00043 { 00044 00045 public: 00046 00047 //'tors... 
00048 MAl_Readonly(size_t bufsize, std::set<db_recno_t>& recnolist, TrapperDoc* pdoc); 00049 virtual ~MAl_Readonly(); 00050 00051 00052 void print_info(size_t ID) { 00053 cerr << "mal_readonly: print_info: ID = " << ID << endl; 00054 cerr << "get_seq_begin_global( ID ) = " << get_seq_begin_global( ID ) << endl; 00055 cerr << "get_seq_end_global( ID ) = " << get_seq_end_global( ID ) << endl; 00056 cerr << "get_len( ID ) = " << get_len( ID ) << endl; 00057 } 00058 00059 //Common methods 00060 00061 00062 size_t get_num_seq(); 00063 std::string get_name( size_t ID ); 00064 std::string get_header( size_t ID ); 00065 std::string get_seq( size_t ID ); 00066 std::string get_strand( size_t ID ); 00067 size_t get_len( size_t ID); 00068 void select_read( size_t ID, bool status ); 00069 00070 //Separate interfaces 00071 00072 size_t get_seq_row( size_t ID ); 00073 00074 size_t get_seq_begin( size_t ID ); 00075 size_t get_seq_begin_global( size_t ID ); 00076 00077 // int get_seq_begin( size_t ID );//FIX THIS 00078 // int get_seq_begin_global( size_t ID );//FIX THIS 00079 00080 size_t get_seq_end( size_t ID ); 00081 size_t get_seq_end_global( size_t ID ); 00082 00083 size_t get_beginGood( size_t ID ); 00084 size_t get_beginGood_global( size_t ID ); 00085 00086 size_t get_endGood( size_t ID ); 00087 size_t get_endGood_global( size_t ID ); 00088 00089 base_t get_base( size_t ID, size_t index ); 00090 base_t get_base_global( size_t ID, size_t index ); 00091 00092 qual_t get_qual( size_t ID, size_t index ); 00093 qual_t get_qual_global( size_t ID, size_t index ); 00094 00095 bool is_DNP(size_t ID, size_t index); 00096 bool is_DNP_global(size_t ID, size_t index); 00097 00098 int get_DNP_ID(size_t ID, size_t index); 00099 int get_DNP_ID_global(size_t ID, size_t index); 00100 00101 int get_DNP_type(size_t ID, size_t index); 00102 int get_DNP_type_global(size_t ID, size_t index); 00103 00104 protected: 00105 //Protected methods 00106 size_t get_buffID(size_t ID); 00107 size_t 
next_buffID(); 00108 00109 virtual void flush_buffer(size_t buffID ); 00110 void read_from_db(size_t buffID, size_t ID); 00111 void read_seq_from_db( db_recno_t recno, size_t buffID ); 00112 void read_feat_from_db( db_recno_t recno, size_t buffID, const string& data_type_name); 00113 00114 00115 //Protected structs 00116 00117 struct dnp_struct 00118 { 00119 dnp_struct(bool is = false, db_recno_t rec = 0, int id = -1, int t = -1 ) : 00120 isDNP(is), recno(rec), ID(id), type(t) {} 00121 00122 bool isDNP; 00123 db_recno_t recno; 00124 int ID; 00125 int type; 00126 }; 00127 00128 //Members 00129 size_t buff_size; 00130 size_t num_seq; 00131 TrapperDoc* doc; 00132 std::set<db_recno_t>& selectedReads; 00133 00134 //Maybe use deques instead?? 00135 //These guys are of the buffer size 00136 std::vector<std::vector<base_t> > seqs; 00137 std::vector<std::vector<qual_t> > quals; 00138 std::vector<std::vector<dnp_struct> > DNPs; 00139 std::vector<std::string> names; 00140 std::vector<std::string> headers; 00141 std::vector<std::string> mates; 00142 std::vector<std::string> strands; 00143 00144 std::vector<size_t> seq_rows; 00145 std::vector<size_t> seq_begin_global; 00146 std::vector<size_t> seq_end_global;//Unnecessary?? We have the sizes of seqs... 00147 std::vector<size_t> seq_beginGood;//NB, not global! 00148 std::vector<size_t> seq_endGood;//NB, not global! 00149 std::vector<size_t> mate_lengths; 00150 00151 //Buffer stuff 00152 vector<db_recno_t> ID_to_dbID;//Should be of actual data set size 00153 vector<size_t> ID_to_buffID;//Ditto 00154 vector<db_recno_t> buffID_to_dbID;//buffer size 00155 vector<db_recno_t> buffID_to_ID;//buffer size 00156 vector<bool> put_in_db;//Watch out for vector<bool>... 00157 00158 00159 }; 00160 00161 00162 #endif //MAL_READONLY_H_EA 00163 00164 00165 00166 //Should this stuff be private?? 
/*
  void change_base( size_t ID, size_t baseIndex, char newBase );
  void set_seq_begin( size_t index, size_t pos );
  void set_beginGood( const size_t index, const size_t pos );
  void set_endGood( const size_t index, const size_t pos );
  void insert_base(size_t ID, size_t before_index, char base);
  void remove_base(size_t ID, size_t index);
  void put_qual( size_t ID, size_t qualValIndex, const size_t qualityValue );
  void delete_seq(size_t ID);
  bool is_deleted( size_t ID );
*/


//OBSOLETE???
/*
  std::string get_headerQ( size_t ID );
  size_t isPossibleRepeat(size_t ID);//????????????????????
  void mark_possibleRepeat(size_t ID);//??????????????????
  size_t append_seq( char seqName[], char seqHeader[] );
  size_t get_first_revComp_index();
  void set_max_coverage( size_t ID, size_t index, size_t cov );
  size_t get_max_coverage( size_t ID, size_t index );
  void set_pos_non_chimeric( size_t ID, size_t index);
  size_t is_chimeric( size_t ID, size_t index );
  void set_beginAnalyzable( const size_t index, const size_t pos );
  size_t get_beginAnalyzable( const size_t index );
  void set_endAnalyzable( const size_t index, const size_t pos );
  size_t get_endAnalyzable( const size_t index );
  size_t get_number_seqs_in_file(const std::string fileName);
  size_t get_number_seqs_in_DATA_file(const std::string fileName);
  void set_qualBegin( const size_t index, const size_t pos );
  size_t get_qualBegin( const size_t index );
  void set_qualEnd( const size_t index, const size_t pos );
  size_t get_qualEnd( const size_t index );
  size_t is_quality( size_t index );
  void mark_is_quality(size_t id);
  size_t seq_size( size_t ID );
  size_t different_strands(size_t ID1, size_t ID2);
  size_t qual_size( size_t ID );
  char get_comp_base(char base);
  size_t get_ID_in_revComp_counterpart(size_t ID);
  size_t get_index_in_revComp_counterpart(size_t ID, size_t index);
  char get_DNP(size_t ID, size_t index);
  void set_DNP(size_t ID, size_t index, char base, size_t unique);
  void set_templ_DNP(size_t ID, size_t index);
  void set_templ_DNP_pos(size_t ID, size_t index, size_t pos);
  size_t is_templ_DNP(size_t ID, size_t index);
  size_t get_templ_DNP_pos(size_t ID, size_t index);
  void set_DNP_ncorr(size_t ID, size_t index, size_t ncorr);
  size_t get_DNP_ncorr(size_t ID, size_t index);
  void set_DNP_p(size_t ID, size_t index, double p);
  double get_DNP_p(size_t ID, size_t index);
  void set_DNP_against_insert(size_t ID, size_t index, char base, size_t unique);
  void set_DNP_against_deletion(size_t ID, size_t index, size_t unique, size_t unset_DNP);
  char get_DNP_against_insert(size_t ID, size_t index);
  char get_DNP_against_deletion(size_t ID, size_t index);
  size_t get_DNP_type(size_t ID, size_t index);
  size_t get_DNP_type_against_insert(size_t ID, size_t index);
  size_t get_DNP_type_against_deletion(size_t ID, size_t index);
  size_t DNP_mismatch(size_t ID, size_t index, char base);
  size_t DNP_mismatch_against_insert(size_t ID, size_t index, char base);
  size_t DNP_mismatch_against_deletion(size_t ID, size_t index);
*/