Scaffolding  0.1
This program can assemble genome scaffolds using the pairing information in paired-end reads.
min_matrix.hpp
1 
2 
3 #ifndef MIN_MATRIX_HPP
4 #define MIN_MATRIX_HPP
5 
6 #include <memory>
7 
8 #include "utils/utils.hpp"
9 #include "utils/graph_typedefs.hpp"
10 
11 // set if the solution should be handled by the DP algo
12 
13 #define HANDLE_SOLUTION(x) x
14 
15 namespace scaffold{
16  typedef std::list<EdgeName> EdgeNameList;
17  struct global_list;
18 
19  typedef std::shared_ptr<global_list> SharedGList;
20  typedef std::list<SharedGList> ListGList;
21  typedef std::shared_ptr<ListGList> SharedListGList;
22 
// placeholder edge name carried by nodes that do not represent an edge (splitting nodes)
// written as standard list-initialization; the former '(EdgeName){...}' compound literal is a GNU extension in C++
#define NO_EDGE (EdgeName{NONAME, NONAME})
24 
25  // a global list of edges to be shared by all the DP-tables
26  struct global_list{
27  EdgeName e;
28 
29  SharedGList left, right;
30 
31  global_list(const EdgeName& _e, const SharedGList& _left = nullptr): e(_e), left(_left), right(nullptr) {};
32  global_list(const SharedGList& _left, const SharedGList _right): e(NO_EDGE), left(_left), right(_right) {};
33  };
34 
35  // an entry for a field of (path,cycle)
37  {
38  public:
39  size_t weight;
40  protected:
41  SharedGList edges;
42  public:
43  // copy constructor
44  MatrixEntry(const MatrixEntry& E): weight(E.weight), edges(E.edges) {}
45  // weight == SIZE_MAX will represent an undefined entry
46  MatrixEntry(const size_t _weight = SIZE_MAX): weight(_weight), edges(nullptr) {}
47  MatrixEntry(const EdgeName& e, const size_t _weight = SIZE_MAX): weight(_weight), edges(std::make_shared<global_list>(e)) {}
48  // handling undefinedness
49  bool is_undefined() const { return weight == SIZE_MAX; }
50  void set_undefined() { weight = SIZE_MAX; }
51 
52  bool has_no_edges(){
53  return edges == nullptr;
54  }
55 
56  void clear(){
57  weight = 0;
58  edges.reset();
59  }
60 
61  void append_edge(const EdgeName& e, const size_t _weight = 0)
62  {
63  weight += _weight;
64  HANDLE_SOLUTION(
65  if(edges != nullptr) {
66  // if we already have some edges, add e as the new head of the global list
67  edges = std::make_shared<global_list>(e, edges);
68  } else {
69  // if e is our first edge, just initialize our edges
70  edges = std::make_shared<global_list>(e);
71  }
72  )
73  }
74 
75  void join(const MatrixEntry& ME)
76  {
77  weight += ME.weight;
78  HANDLE_SOLUTION(
79  if(edges != nullptr){
80  // if we already have some edges, create a new splitting node pointing to both our and ME's edges
81  edges = std::make_shared<global_list>(edges, ME.edges);
82  } else {
83  // if we don't have any edges yet, then point to ME's edges
84  edges = ME.edges;
85  }
86  )
87  }
88 
89  EdgeNameList get_edges() const
90  {
91  if(edges != nullptr){
92  EdgeNameList result;
93  std::stack<SharedGList> el_stack;
94  el_stack.push(edges);
95  while(!el_stack.empty()){
96  SharedGList top_el = el_stack.top(); el_stack.pop();
97  const EdgeName& e = top_el->e;
98  if(e != NO_EDGE) result.push_back(e);
99  if(top_el->left != nullptr) el_stack.push(top_el->left);
100  if(top_el->right != nullptr) el_stack.push(top_el->right);
101  }
102  return result;
103  } else return EdgeNameList();
104  }
105 
106 
107  // assign the MatrixEntry
108  void operator=(const MatrixEntry& ME)
109  {
110  weight = ME.weight;
111  HANDLE_SOLUTION(edges = ME.edges;)
112  }
113 
114  // add weight and edges of another MatrixEntry
115  void operator+=(const MatrixEntry& ME)
116  {
117  join(ME);
118  }
119  MatrixEntry operator+(const MatrixEntry& ME) const
120  {
121  MatrixEntry result(*this);
122  result += ME;
123  return result;
124  }
125 
126  };
127 
128  // update an entry if it is worse than the given one & return whether an update took place
129  template<typename Index, typename Matrix, class Compare = std::less<size_t> >
130  bool update_if_worse(Matrix& m, const Index& index, MatrixEntry* ME)
131  {
132  const Compare is_better;
133  // get the entry at index, creating it as a copy of E if necessary
134  const typename Matrix::iterator ME_i = m.find(index);
135  if(ME_i == m.end()){
136  m.emplace(index, ME);
137  return true;
138  } else {
139  // if it existed before and its weight is worse than E's, then update it
140  if(is_better(ME->weight, ME_i->second->weight)) {
141  delete ME_i->second;
142  ME_i->second = ME;
143  return true;
144  } else return false;
145  }
146  }
147 /*
148  // update an entry if it is worse than the given one & return whether an update took place
149  template<typename Index, typename Matrix, class Compare = std::less<size_t> >
150  bool update_if_worse(Matrix& m, const Index& index, const SharedListGList& edges, const size_t weight)
151  {
152  MatrixEntry* ME = new MatrixEntry(edges, weight);
153  if(!update_if_worse<Compare>(m, index, ME)){
154  delete ME;
155  return false;
156  } else return true;
157  }
158  template<typename Index, typename Matrix, class Compare = std::less<size_t> >
159  bool update_if_worse(Matrix& m, const Index& index, const MatrixEntry& E)
160  {
161  return update_if_worse<Index, Matrix, Compare>(m, index, E.edges, E.weight);
162  }
163  */
164  // update an entry if it is worse than the sum of the given two & return whether an update took place
165  template<typename Index, typename Matrix, class Compare = std::less<size_t> >
166  bool update_if_worse(Matrix& m, const Index& index, const MatrixEntry& E, const MatrixEntry& cE)
167  {
168  const Compare is_better;
169  // get the entry at index, creating it as a copy of E if necessary
170  const typename Matrix::iterator ME_i = m.find(index);
171  if(ME_i == m.end()){
172  MatrixEntry* ME = new MatrixEntry(E);
173  *ME += cE;
174  m.emplace(index, ME);
175  return true;
176  } else {
177  MatrixEntry& ME = *(ME_i->second);
178  if(is_better(E.weight + cE.weight, ME.weight)){
179  // if it existed before and its weight is worse than E's, then update it
180  ME = E;
181  ME += cE;
182  return true;
183  } else return false;
184  }// if
185  }// function
186 
187 
188 
189  // a matrix of paths & cycles that keeps a minimum & refuses insertion of pareto-worse elements
190  template<class Compare = std::less<size_t> >
192  {
193  public:
194  typedef unordered_map<PathsAndCycles, MatrixEntry*> entries_t;
195 
196  // the bounds
197  const PathsAndCycles max;
198  protected:
199  entries_t* entries;
200  const Compare is_better;
201 
202  public:
203  PathsAndCycles best_index;
204 
205  PCMinMatrix(const unsigned paths, const unsigned cycles): max(paths, cycles), entries(new entries_t()), is_better() {}
206  PCMinMatrix(const PathsAndCycles& _max): max(_max), entries(new entries_t()), is_better() {}
207  PCMinMatrix(const PCMinMatrix& m):
208  max(m.max),
209  entries(new entries_t()),
210  is_better(),
211  best_index(m.best_index)
212  {
213  if(!m.entries->empty()){
214  const MatrixEntry* const best_ME = m.entries->at(best_index);
215  size_t best_weight = best_ME->weight;
216  entries->emplace(best_index, new MatrixEntry(*best_ME));
217  for(auto& ME: *m.entries)
218  if( !(best_index <= ME.first) || is_better(ME.second->weight, best_weight))
219  entries->emplace(ME.first, new MatrixEntry(*ME.second));
220  }
221  }
222  ~PCMinMatrix()
223  {
224  for(auto& ME: *entries) delete ME.second;
225  delete entries;
226  }
227 
228  // writing to the storage, return whether something was written
229  // NOTE: if an entry is overwritten, its memory is freed
230  bool put(const PathsAndCycles& index, MatrixEntry* ME)
231  {
232  if(index > max) return false;
233 
234  const bool was_empty = entries->empty();
235  const size_t best_weight = was_empty ? SIZE_MAX : entries->at(best_index)->weight;
236  const size_t weight = ME->weight;
237  // first, check if the best_index is better than what we are about to enter
238  if(was_empty || !(best_index <= index) || is_better(weight, best_weight)){
239  DEBUG5(std::cout << "trying to put entry at "<<index<<"..."<<std::flush);
240  if(!update_if_worse<PathsAndCycles, entries_t, Compare>(*entries, index, ME)) return false;
241  DEBUG5(std::cout << "success, now updating best_index ("<<best_index.p<<","<<best_index.c<<") if "<<was_empty<<" || Compare("<<weight<<","<<best_weight<<")"<<std::endl);
242  // finally, update best_index, throwing out the item at best_index if necessary
243  if(index != best_index){
244  if(!was_empty && (index < best_index) && is_better(weight, best_weight)){
245  const typename entries_t::iterator min_i = entries->find(best_index);
246  assert(min_i != entries->end());
247  delete min_i->second;
248  entries->erase(min_i);
249  }
250  if(was_empty || is_better(weight, best_weight)) best_index = index;
251  }
252  DEBUG5(std::cout << "new best_index is ("<<best_index.p<<","<<best_index.c<<")"<<std::flush<<" with weight "<<entries->at(best_index)->weight<<std::endl);
253  return true;
254  } else return false;
255  }
256 
257  bool put_empty(const PathsAndCycles& index)
258  {
259  MatrixEntry* ME = new MatrixEntry(0);
260  if(!put(index, ME)){
261  delete ME;
262  return false;
263  } else return true;
264  }
265  // put an entry containing a single edge
266  bool put(const PathsAndCycles& index, const EdgeName& e, const size_t weight)
267  {
268  MatrixEntry* ME = new MatrixEntry(e, weight);
269  if(!put(index, ME)){
270  delete ME;
271  return false;
272  } else return true;
273  }
274  // put an entry containing all edges of g
275  template<class Graph>
276  bool put_all(const PathsAndCycles& index, const Graph& g)
277  {
278  MatrixEntry* ME = new MatrixEntry(0);
279  for(EdgeIterRange<Graph> er = boost::edges(g); er.first != er.second; ++er.first){
280  const Edge<Graph>& e = *er.first;
281  const size_t weight = boost::get(boost::edge_weight, g, e);
282  ME->append_edge(get_edge_name(e, g), weight);
283  }
284  if(!put(index, ME)) {
285  delete ME;
286  return false;
287  } else return true;
288  }
289  /*
290  bool put(const unsigned paths, const unsigned cycles, const SharedEdgeList& edges, const size_t weight)
291  {
292  return put(PathsAndCycles(paths, cycles), edges, weight);
293  }
294  bool put(const PathsAndCycles& index, const MatrixEntry& E)
295  {
296  MatrixEntry* ME = new MatrixEntry(E);
297  if(!put(index, ME)){
298  delete ME;
299  return false;
300  } else return true;
301  }
302  bool put(const unsigned paths, const unsigned cycles, const MatrixEntry& E)
303  {
304  return put(PathsAndCycles(paths, cycles), E);
305  }
306  */
307  // put an entry at index with the sum of the given entries
308  bool put(const PathsAndCycles& index, const MatrixEntry& E, const MatrixEntry& cE)
309  {
310  assert(index <= max);
311 
312  const size_t sum_weight = E.weight + cE.weight;
313  const bool was_empty = entries->empty();
314  const size_t best_weight = was_empty ? SIZE_MAX : entries->at(best_index)->weight;
315  // first, check if the best_index is better than what we are about to enter
316  if(was_empty || !(best_index <= index) || is_better(sum_weight, best_weight)){
317  if(!update_if_worse<PathsAndCycles, entries_t, Compare>(*entries, index, E, cE)) return false;
318  // finally, update best_index
319  if(index != best_index){
320  if(!was_empty && (index <= best_index) && is_better(sum_weight, best_weight)) entries->erase(best_index);
321  if(was_empty || is_better(sum_weight, best_weight)) best_index = index;
322  }
323  return true;
324  } else return false;
325  }// function
326 
327  // add an edge of certain weight to all entries
328  void add_to_all(const EdgeName& e, const size_t weight)
329  {
330  for(auto& ME : *entries) ME.second->append_edge(e, weight);
331  }
332 
333  // reading from the storage
334  const MatrixEntry& get(const PathsAndCycles& index) const
335  {
336  assert(index <= max);
337  return *(entries->at(index));
338  }
339  const MatrixEntry& get(const unsigned paths, const unsigned cycles) const
340  {
341  return get({paths, cycles});
342  }
343  const MatrixEntry& at(const PathsAndCycles& index) const
344  {
345  return get(index);
346  }
347  const MatrixEntry& at(const unsigned paths, const unsigned cycles) const
348  {
349  return get(paths, cycles);
350  }
351  std::pair<entries_t::const_iterator, entries_t::const_iterator> get_entries() const
352  {
353  return std::pair<entries_t::const_iterator, entries_t::const_iterator>(entries->begin(), entries->end());
354  }
355 
356  // merge a Matrix pc into our own, destroying pc in the process
357  void destructive_merge(PCMinMatrix<Compare>& pc)
358  {
359  DEBUG6(std::cout << "destructively merging "<<pc.entries->size()<<" entries into our "<<size()<<" entries"<<std::endl);
360  for(auto ME = pc.entries->begin(); ME != pc.entries->end(); ME = pc.entries->erase(ME)){
361  // try to put this entry into our own matrix
362  if(!put(ME->first, ME->second)){
363  // if we didn't put the entry into our table, we'll free its memory & remove it from the map
364  delete ME->second;
365  }// if
366  }// for
367  }// function
368 
369 
370  // modify a key by adding a number of paths & cycles
371  bool add_to_key(const PathsAndCycles& key, const PathsAndCycles& pc){
372  if(pc.p + pc.c > 0){
373  const entries_t::iterator i = entries->find(key);
374  assert(i != entries->end());
375  const MatrixEntry* ME = i->second;
376  entries->erase(i);
377  return put(key + pc, ME);
378  }
379  }
380  // modify each key, watch the pointer-juggling!
381  void add_to_each_key(const PathsAndCycles& pc){
382  if(pc.p + pc.c > 0){
383  // update max: doesn't change, min will be updated automatically
384  // update entries (write completely anew)
385  entries_t* old_entries = entries;
386  entries = new entries_t;
387  for(auto& ME: *old_entries) put(ME.first + pc, ME.second);
388  // delete old entries
389  delete old_entries;
390  }
391  }
392 
393  // find an entry E of me and an entry SE of S such that their indices sum up to (p,c) and their weight is min among all such entries
394  PathsAndCycles find_min_entry_combination_by_iteration(const PathsAndCycles& pc, const PCMinMatrix& m){
395  PathsAndCycles min_pc;
396  size_t best_weight = SIZE_MAX;
397 
398  for(auto E : *entries){
399  const PathsAndCycles Epc = E.first;
400  const entries_t::const_iterator mE = m.entries->find(pc - Epc);
401  if(mE != m.entries->end()){
402  if(is_better(E.second->weight + mE->second->weight, best_weight)){
403  best_weight = E.second->weight + mE->second->weight;
404  min_pc = Epc;
405  }// if
406  }// if
407  }// for
408  return min_pc;
409  }// function
410 
411  // find the minimum entry combination by testing all pairs (p',c') <= (p,c)
412  PathsAndCycles find_min_entry_combination_by_testing(const PathsAndCycles& pc, const PCMinMatrix& m){
413  PathsAndCycles min_pc;
414  size_t best_weight = SIZE_MAX;
415 
416  for(unsigned mp = 0; (mp <= pc.p) && (mp <= m.max.p); ++mp)
417  for(unsigned mc = 0; (mc <= pc.c) && (mc <= m.max.c); ++mc){
418  PathsAndCycles current(mp, mc);
419  // check if both entries exist
420  const entries_t::const_iterator mE = m.entries->find(current);
421  PathsAndCycles diff(pc - current);
422  if(mE != m.entries->end()){
423  const entries_t::const_iterator E = entries->find(diff);
424  if(E != entries->end())
425  if(is_better(mE->second->weight + E->second->weight, best_weight)){
426  best_weight = mE->second->weight + E->second->weight;
427  min_pc = diff;
428  } // if
429  } // if
430  } // for
431  return min_pc;
432  }// function
433 
434  // get a pair of references to solution entries such that their paths add up to p and their cycles add up to c
435  PathsAndCycles find_min_entry_combination(const PathsAndCycles& pc, const PCMinMatrix& m){
436  // if the number of entries is not too large yet, we iterate through the entries instead of testing all (p',c') <= (p,c)
437  if(entries->size() < std::min(pc.p * pc.c, m.max.p * m.max.c))
438  return find_min_entry_combination_by_iteration(pc, m);
439  else
440  return find_min_entry_combination_by_testing(pc, m);
441  }// function
442 
443  // output a minimum-weight MatrixEntry within the solution boundaries
444  const MatrixEntry& get_min_entry() const
445  {
446  return *(entries->at(best_index));
447  }
448 
449  const size_t size() const
450  {
451  return entries->size();
452  }
453 
454  // return the largest pc for which pc <= i holds for all indices (note that pc is not necessarily an existing index)
455  const PathsAndCycles smallest_index() const
456  {
457  if(entries->empty()) return PathsAndCycles(0,0);
458  PathsAndCycles result(UINT_MAX,UINT_MAX);
459  for(auto& i : *entries){
460  if(i.first.p < result.p) result.p = i.first.p;
461  if(i.first.c < result.c) result.c = i.first.c;
462  }
463  return result;
464  }
465 
466  // << operator for solutions outputting a table of weights
467  friend std::ostream& operator<<(std::ostream& os, const PCMinMatrix& m){
468  // write header
469  os << std::endl<< " ";
470  for(unsigned p = 0; p <= m.max.p; ++p) os <<"\t" << p;
471  os << "\t\tbest index = "<<m.best_index<<std::endl;
472  // write table
473  for(unsigned c = 0; c <= m.max.c; ++c){
474  if(c < 10) os << " ";
475  os << c<<":\t";
476  for(unsigned p = 0; p <= m.max.p; ++p){
477  try{ os << m.at(p, c).weight << "\t"; } catch(...) { os << "\t"; };
478  }
479  os << std::endl;
480  }// for
481  return os;
482  } // function
483 
484  };
485 
486 
487 }// namespace
488 
489 #endif
490 
Definition: min_matrix.hpp:191
unsigned paths
maximum number of paths that a solution graph should consist of
Definition: command_line.hpp:56
unsigned cycles
maximum number of cycles that a solution graph should consist of
Definition: command_line.hpp:58
Definition: read_adj_list.hpp:22
Definition: min_matrix.hpp:36
Definition: min_matrix.hpp:26
Definition: graph_typedefs.hpp:26