Scaffolding  0.1
This program can assemble genome scaffolds using the pairing information in paired-end reads.
scaffolding_utils.hpp
1 
2 
3 #ifndef SCAFFOLDING_UTILS_HPP
4 #define SCAFFOLDING_UTILS_HPP
5 
6 
7 #include "utils/graph_typedefs.hpp"
8 #include "utils/scaffolding_typedefs.hpp"
10 #include "utils/instance.hpp"
11 #include "utils/ambigous_paths.hpp"
12 
13 
14 using namespace boost;
15 
16 namespace scaffold{
17 
18  // get a list of contig jumps
19  void get_contig_jumps(const ScaffoldGraph& sg, std::list<contig_jump >& jumps)
20  {
21  const RawScaffoldGraph& g = sg.get_graph();
22  const ScafMatching& matching = sg.get_matching();
23 
24  for(const ScafMatching::value_type& contig: matching){
25  ScafVertex u = contig.first;
26  ScafVertex v = contig.second;
27  const ScafEdge& uv = sg.find_edge(u, v).first;
28  const unsigned contig_length = g[uv].length;
29 
30  // for a jump, we have to have a short contig...
31  if(contig_length < sg.get_graph_property().insert_size){
32  #warning TODO: implement detection of jumps over more than one contig
33  #warning TODO: for any given short contig, only keep the most supported jump
34  if(boost::degree(u, g) == 1) std::swap(u, v); // (( normalize deg(u) >= deg(v) ))
35  if((boost::degree(u, g) == 2) && (boost::degree(v, g) == 1)){
36  // ... that is at the end of a 2-path
37  ScafOEdgeIter e = boost::out_edges(u, g).first;
38  if(boost::target(*e, g) == v) ++e;
39  jumps.emplace_back(uv, std::initializer_list<ScafEdge >({*e}));
40  } else {
41  // ... or that is spanned by a 3-path
42  for(ScafOEdgeIterRange u_r = boost::out_edges(u, g); u_r.first != u_r.second; ++u_r.first){
43  const ScafEdge& ux = *u_r.first;
44  const ScafVertex& x = boost::target(ux, g);
45  if(x != v){// only consider non-contigs
46  for(ScafOEdgeIterRange v_r = boost::out_edges(v, g); v_r.first != v_r.second; ++v_r.first){
47  const ScafEdge& vy = *v_r.first;
48  const ScafVertex& y = boost::target(vy, g);
49  if( (y != u) && (matching.at(x) != y)){
50  const std::pair<ScafEdge, bool> xy_pair = boost::edge(x, y, g);
51  if(xy_pair.second)
52  jumps.emplace_back(uv, std::initializer_list<ScafEdge >({ux, xy_pair.first, vy}));
53  }// if
54  }// for
55  }// if
56  }// for
57  }// if
58  }// if
59  }// for
60  }
61 
62  // derive multiplicities for non-contig edges from the multiplicities of incident contig edges
63  void fix_non_matching_multiplicities(ScaffoldGraph& sg)
64  {
65  const RawScaffoldGraph& g = sg.get_graph();
66  for(auto range = boost::edges(g); range.first != range.second; ++range.first){
67  const ScafEdge& uv = *range.first;
68  if(!sg[uv].is_matching_edge()){
69  const ScafVertex& u = boost::source(uv, g);
70  const ScafVertex& v = boost::target(uv, g);
71  const ScafEdge ux = sg.incident_matching_edge(u);
72  const ScafEdge vy = sg.incident_matching_edge(v);
73  sg[uv].multiplicity = std::min(sg[ux].multiplicity, sg[vy].multiplicity);
74 
75  DEBUG5(std::cout << "fixing multiplicity of "<<sg.get_edge_name(uv)<<" (len "<<sg[uv].length<<") to the min of "<<sg.get_edge_name(ux)<<" ("<<sg[ux].multiplicity<<") & "<<sg.get_edge_name(vy)<<" ("<<sg[vy].multiplicity<<")"<<std::endl);
76  }
77  }// for all non-contig edges of sg
78  }
79 
80  // cut all non-contigs whose weight is below a certain threshold, but don't decrease the degree below min_deg
81  void cut_off_threshold(ScaffoldGraph& sg, const unsigned threshold, const unsigned min_degree = 0)
82  {
83  if(!threshold) return;
84  const RawScaffoldGraph& g = sg.get_graph();
85  for(ScafEdgeIterRange r = boost::edges(g); r.first != r.second;){
86  const ScafEdge& uv = *r.first;
87  const ScafEdgeProperty& uv_info = g[uv];
88  if(!uv_info.is_matching_edge()){
89  const ScafVertex& u = boost::source(uv, g);
90  const ScafVertex& v = boost::target(uv, g);
91  const unsigned uv_weight = g[uv].weight;
92  if(uv_weight < threshold){
93  if((min_degree < 2) || ((boost::degree(u, g) > min_degree) && (boost::degree(v, g) > min_degree))){
94  const ScafEdgeIter to_del = r.first;
95  ++r.first;
96  sg.delete_edge(*to_del);
97  } else ++r.first;
98  } else ++r.first;
99  } else ++r.first;
100  }// for
101  }
102 
103  // divide each weight by the # of times this edge can be in any solution
104  // NOTE: assumes that non-matching edges have multiplicities!
105  // if this is not the case, you should call fix_non_matching_multiplicities before calling this function!
106  void scale_weights(ScaffoldGraph& sg)
107  {
108  const RawScaffoldGraph& g = sg.get_graph();
109  for(auto er = boost::edges(g); er.first != er.second; ++er.first) {
110  const ScafEdge& uv = *er.first;
111  ScafEdgeProperty& uv_info = sg[uv];
112  if(!uv_info.is_matching_edge()){
113  const unsigned uv_multi = uv_info.multiplicity;
114  DEBUG5(std::cout << "scaling "<<sg.get_edge_name(uv)<<" of multiplicity "<<uv_multi<<std::endl);
115 #warning TODO: either use float as weight or scale up everything by the max multiplicity?
116  if(uv_multi != 1) uv_info.weight /= uv_multi;
117  }// if uv is not a matching edge
118  }// for all edges
119  }// function
120 
121  // set up vertex names as integers by their indices
122  void setup_vertex_names(RawScaffoldGraph& g)
123  {
124  for(ScafVIterRange r = boost::vertices(g); r.first != r.second; ++r.first) {
125  ScafVertexProperty& v_prop = g[*r.first];
126  if(v_prop.name.empty()) v_prop.index_to_name();
127  }
128  }
129 
130 
131  // clear all non-matching edges incident to a given vertex
132  void clear_nonmatching(const ScafVertex& u, RawScaffoldGraph& g)
133  {
134  for(auto e_range = boost::out_edges(u, g); e_range.first != e_range.second;){
135  const ScafEdge e = *e_range.first;
136  ++e_range.first;
137  if(!g[e].is_matching_edge())
138  boost::remove_edge(e, g);
139 
140  }// for all vertices
141  }// function
142 
143  // return whether u is the start of a contig
144  bool is_sequence_start(const ScafVertex& u, const RawScaffoldGraph& g){
145  // u is a contig start if its degree is 1 or at least 3
146  if(boost::degree(u, g) == 2) {
147  // if the degree of u is 2, then it depends on the multiplicity of the incident edges
148  auto e_iter = boost::out_edges(u, g).first;
149  const ScafEdge& e1 = *(e_iter++);
150  const ScafEdge& e2 = *e_iter;
151  return (g[e1].multiplicity != g[e2].multiplicity);
152  } else return true;
153  }
154 
155  // forward declaration, see details below
156  void get_alternating_paths_of_max_length(const ScaffoldGraph& sg,
157  const ScafVertex& u,
158  const unsigned max_length,
159  std::list<AlternatingPath>& result,
160  const bool start_with_matched,
161  ScafVertexSet* const _forbidden = NULL);
162 
163  // get alternating paths of length max_length starting with the edge uv & avoiding the forbidden vertices
164  // (see the next function for details)
165  void get_alternating_paths_of_max_length(const ScaffoldGraph& sg,
166  const ScafEdge& uv,
167  const ScafVertex& v,
168  const unsigned max_length,
169  std::list<AlternatingPath>& result,
170  const bool start_with_matched,
171  ScafVertexSet* const forbidden)
172  {
173  assert(forbidden);
174  if(!contains(*forbidden, v)){
175  std::list<AlternatingPath> paths_from_v;
176  get_alternating_paths_of_max_length(sg, v, max_length, paths_from_v, start_with_matched, forbidden);
177 
178  // prepend uv to all paths from v (create a single empty path if there are none)
179  if(paths_from_v.empty()) paths_from_v.emplace_back(AlternatingPath());
180  for(auto& p: paths_from_v) p.emplace_front(uv);
181 
182  // add the resulting paths to the result
183  result.splice(result.cend(), paths_from_v);
184  }
185  }
186 
187  // compute all alternating paths starting in u such that the total contig length does not exceed max_length
188  // if start_with_matched is set, consider only paths starting with a matched edge
189  // do not consider paths going to vertices in "forbidden"
190  void get_alternating_paths_of_max_length(const ScaffoldGraph& sg,
191  const ScafVertex& u,
192  const unsigned max_length,
193  std::list<AlternatingPath>& result,
194  const bool start_with_matched,
195  ScafVertexSet* const _forbidden)
196  {
197  ScafVertexSet* const forbidden = (_forbidden ? _forbidden : new ScafVertexSet());
198  forbidden->emplace(u);
199  const ScafVertex& u_match = sg.matched_with(u);
200  if(start_with_matched){
201  const ScafEdge uv = sg.find_edge(u, u_match).first;
202  const ScafEdgeProperty& uv_info = sg[uv];
203  const unsigned uv_length = ( start_with_matched ? uv_info.length : 0);
204  if(uv_length <= max_length)
205  get_alternating_paths_of_max_length(sg, uv, u_match, max_length - uv_length, result, false, forbidden);
206  } else {
207  const RawScaffoldGraph& g = sg.get_graph();
208  for(auto range = boost::out_edges(u, g); range.first != range.second; ++range.first){
209  const ScafEdge uv = *range.first;
210  const ScafVertex& v = boost::target(uv, g);
211  if(v != u_match)
212  get_alternating_paths_of_max_length(sg, uv, v, max_length, result, true, forbidden);
213  }// for all incident with u
214  }
215  // clean up the forbidden set if we created it
216  if(!_forbidden) delete forbidden;
217  }// function
218 
219 
220  // given a solution to a scaffold graph
221  // deconstruct the solution graph into a set of disjoint alternating paths & cycles
222 #warning TODO: optimize this avoiding all the string copies (maybe use a "fragmented string" class?)
223  void deconstruct_solution(ScaffoldGraph& sg)
224  {
225 #warning TODO: use an external mapping of edgenames to contig names. Right now, we copy the contig names each time we copy the graph!
226 
227  // step 1: break multiplicities in a RawScaffoldGraph by deleting edges incident to endpoints of ambigous paths
228  DEBUG5(std::cout << "removing ambigous paths..."<<std::endl;)
229  kill_ambigous_paths_brutal(sg);
230 
231  // step 2: deconstruct
232  DEBUG5(std::cout << "deconstructing the graph..."<<std::endl;)
233  const RawScaffoldGraph& g(sg.get_graph());
234 
235  boost::unordered_set<ScafVertex> to_check;
236  for(auto range = boost::vertices(g); range.first != range.second; ++range.first)
237  to_check.insert(*range.first);
238 
239  while(!to_check.empty()){
240  const ScafVertex v = *(to_check.cbegin());
241  to_check.erase(to_check.cbegin());
242 
243  if(boost::degree(v, g) > 1){
244  // if u has non-matching edges
245  const ScafEdge& uv = sg.incident_matching_edge(v);
246  const ScafVertex& u = boost::target(uv, g);
247  const ScafEdge vw = *(sg.get_incident_non_matching(v));
248  const ScafVertex& w = boost::target(vw, g);
249  const unsigned uv_multi = sg[uv].multiplicity;
250  const unsigned vw_multi = sg[vw].multiplicity;
251  // if v is incident to a non-matching edge vw of different multiplicity than uv, split off vw
252  if(uv_multi != vw_multi){
253  DEBUG5(std::cout << "detected break point at "<<sg[u].name<<" --"<<sg[uv] <<"--> "<<sg[v].name<<" --x"<<sg[vw]<<"--> "<<sg[w].name<<std::endl;)
254 
255  // add new u'-v'-w
256  const ScafVertex u_prime = sg.add_vertex(u);
257  const ScafVertex v_prime = sg.add_vertex(v);
258 
259  const ScafEdge uv_prime = sg.add_matching_edge(u_prime, v_prime, sg[uv]).first;
260  const ScafEdge vw_prime = sg.add_edge(v_prime, w, sg[vw]).first;
261  sg[uv].multiplicity -= vw_multi;
262  sg[uv_prime].multiplicity = vw_multi;
263  DEBUG5(std::cout << "added matching edge "<<sg.get_edge_name(uv_prime)<<" ["<<sg[uv_prime]<<"]" <<std::endl;)
264  DEBUG5(std::cout << "added edge "<<sg.get_edge_name(vw_prime)<<" ["<<sg[vw_prime]<<"]" <<std::endl;)
265 
266  // force another iteration for v & w
267  to_check.insert(v);
268  to_check.insert(w);
269 
270  // add y-u' if u was not deg-1
271  if(sg.degree(u) > 1){
272  assert(sg.degree(u) == 2);
273  const ScafEdge& uy = *(sg.get_incident_non_matching(u));
274  const ScafVertex& y = boost::target(uy, g);
275  const ScafEdge yu_prime = sg.add_edge(y, u_prime, sg[uy]).first;
276  sg[uy].multiplicity -= vw_multi;
277  sg[yu_prime].multiplicity = vw_multi;
278  DEBUG5(std::cout << "added edge "<<sg.get_edge_name(yu_prime)<<" ["<<sg[yu_prime]<<"]" <<std::endl;)
279  assert(sg[uy].multiplicity == sg[uv].multiplicity);
280  // check y next
281  to_check.insert(y);
282  } // if deg(u) == 2
283 
284  // delete vw
285  sg.delete_edge(vw);
286  }// if uv & vw have different multiplicity
287  }// if v has non-matching edges
288  }// while there are vertices to check
289  }// function
290 
291 
292  // return the index behind the real name if name is "(real_name)(rev)" and 0 otherwise
293  unsigned is_reversed_sequence(const std::string& name)
294  {
295  unsigned start_of_rev_indicator = name.length() - strlen(REVERSE_SEQUENCE_INDICATOR);
296  const unsigned end_of_name = start_of_rev_indicator - 1;
297  if(name[0] != '(') return 0;
298  if(name[end_of_name] != ')') return 0;
299  if(name.substr(start_of_rev_indicator) != REVERSE_SEQUENCE_INDICATOR) return 0;
300  // check that the bracket at index 0 really does belong to the one at end_of_name
301  unsigned level = 1;
302  const char* name_cptr = name.c_str();
303  for(unsigned i = 1; i != end_of_name; ++i){
304  const char c = name_cptr[i];
305  switch(c){
306  case '(': ++level; break;
307  case ')': --level; break;
308  }
309  if(level == 0) return 0;
310  }
311  return start_of_rev_indicator;
312  }
313 
314  // change name as to indicate that it's referring to a reversed sequence
315  void indicate_reversal(std::string& name)
316  {
317  // if the sequence is already reversed, just remove the reverse indicator, otherwise, add a reverse indicator
318  const unsigned start_of_rev_indicator = is_reversed_sequence(name);
319  if(start_of_rev_indicator){
320  name.erase(start_of_rev_indicator-1);
321  name.erase(0, 1);
322  } else name = '(' + name + ')' + REVERSE_SEQUENCE_INDICATOR;
323  }
324 
325  std::string get_reversed_name(const std::string& name)
326  {
327  std::string result = name;
328  indicate_reversal(result);
329  return result;
330  }
331 
332  // get the complement of the given base, or 'N' if it does not have a complement
333  char get_complement(const char base)
334  {
335  const char* complement_index = std::strchr(COMPLEMENTARY_BASES, base);
336  if(complement_index != NULL){
337  const int offset = (complement_index - COMPLEMENTARY_BASES);
338  assert(offset >= 0);
339  char result = COMPLEMENTARY_BASES[(offset % 2) ? offset - 1 : offset + 1];
340  return result;
341  } else return 'N';
342  }
343 
344  // reverse complement a given string in place
345  void reverse_complement_inplace(std::string& sequence)
346  {
347  const int seq_len = sequence.length();
348  const int last_index = seq_len - 1;
349  for(int i = 0; 2 * i < last_index; ++i){
350  const char swap_base = sequence[i];
351  sequence[i] = get_complement(sequence[last_index - i]);
352  sequence[last_index - i] = get_complement(swap_base);
353  }
354  // reverse the middle if the sequence length is even
355  if(seq_len % 2) sequence[last_index / 2] = get_complement(sequence[last_index / 2]);
356  }
357 
358  // return the reverse complement of a sequence
359  std::string reverse_complement(const std::string& sequence)
360  {
361  std::string out(sequence);
362  reverse_complement_inplace(out);
363  return out;
364  }
365 
366 
367  // return the reverse complement of a sequence, modifying its name to account for it
368  std::string named_reverse_complement(const std::string& sequence, std::string& name)
369  {
370  std::string out(sequence);
371  reverse_complement_inplace(out);
372  indicate_reversal(name);
373  return out;
374  }
375 
376  // turn a simple solution graph (only paths & cycles) into a collection of named sequences
377  void simple_solution_to_sequences(const ScaffoldGraph& sg, const SequenceMap& sequences, SequenceMap& out)
378  {
379  const RawScaffoldGraph& g = sg.get_graph();
380  boost::unordered_set<ScafVertex> to_check_deg1, to_check_deg2;
381  for(auto range = boost::vertices(g); range.first != range.second; ++range.first)
382  if(sg.degree(*range.first) == 1)
383  to_check_deg1.insert(*range.first);
384  else
385  to_check_deg2.insert(*range.first);
386 
387  // handle paths first
388  while(!to_check_deg1.empty() && !to_check_deg2.empty()){
389  boost::unordered_set<ScafVertex>& to_check = to_check_deg1.empty() ? to_check_deg2 : to_check_deg1;
390  ScafVertex u = *(to_check.cbegin());
391  to_check.erase(u);
392 
393  DEBUG5(std::cout << "writing down sequence starting at "<<sg[u].name<<": "<<std::endl);
394 
395  // grow a name & sequence along the path/cycle
396  std::string current_name;
397  std::string current_seq;
398 
399  // move along the path & collect the sequences into the output map
400  const ScafVertex cycle_start = u;
401  while(true){
402  // step 1: find the incident matching edge
403  const ScafEdge uv = sg.incident_matching_edge(u);
404  const ScafVertex v = boost::target(uv, g);
405  std::string contig_name = sg[uv].contig_name;
406 
407  // step 2: write down the sequence of uv
408  const OrientedSequence& os = sequences.at(contig_name);
409  DEBUG5(std::cout << contig_name << " (@"<<os.start_vertex<<") ");
410  if(os.start_vertex == sg[u].name){
411  current_seq += os.sequence;
412  current_name += contig_name;
413  } else {
414  assert(os.start_vertex == sg[v].name);
415  current_seq += named_reverse_complement(os.sequence, contig_name);
416  current_name += contig_name;
417  }
418 
419  // step 3: remove u & v from the queue
420  to_check_deg2.erase(u);
421 
422  // step 4: advance along the path, writing down 'N's
423  const unsigned v_deg = sg.degree(v);
424  assert(v_deg <= 2);
425  if(v_deg == 2){
426  to_check_deg2.erase(v);
427  const ScafEdge vw = *(sg.get_incident_non_matching(v));
428 
429  // add 'N's unless vw has NO_LENGTH
430  const int vw_length = sg[vw].length;
431  if(vw_length != NO_LENGTH){
432  assert(sg[vw].length >= 0);
433  current_seq += std::string(sg[vw].length, 'N');
434  }
435 
436  u = boost::target(vw, g);
437  // quit the loop if we arrived at our start point
438  if(u == cycle_start) {
439  // if we detect a cycle, add the cyclic indicator to the sequence name
440  current_name += CYCLIC_SEQUENCE_INDICATOR;
441  break;
442  } else current_name += '+';
443  } else {
444  to_check_deg1.erase(v);
445  break;
446  }
447  }// loop
448  DEBUG5(std::cout << std::endl);
449  // make sure we're not overwriting a sequence
450  auto seq_it = out.find(current_name);
451  while(seq_it != out.cend()){
452  current_name += '\'';
453  seq_it = out.find(current_name);
454  }
455  // register the name & sequence in the output map
456  out.emplace_hint(seq_it, current_name, current_seq);
457  }// while there are degree-one vertices
458  }// function
459 
460 
461 } // namespace
462 
463 #endif
Definition: graph_utils.hpp:18
Definition: read_adj_list.hpp:22
bool contains(const Set &s, const Element &el)
a more readable containment check
Definition: utils.hpp:171