Scaffolding  0.1
This program can assemble genome scaffolds using the pairing information in paired-end reads.
graph_utils.hpp
1 
2 
3 #ifndef GRAPH_UTILS_HPP
4 #define GRAPH_UTILS_HPP
5 
6 
7 #include <stack>
8 #include <boost/unordered_set.hpp>
9 #include <boost/graph/max_cardinality_matching.hpp>
10 //#include <boost/graph/copy.hpp>
11 #include "utils/copy.hpp"
12 
13 #include "utils.hpp"
14 #include "graph_typedefs.hpp"
15 
16 using namespace boost;
17 
18 namespace boost{
19  // strangely, boost::filtered_graph misses degree()
20  template<class Graph, typename EPred, typename VPred>
21  size_t degree(const Vertex<filtered_graph<Graph,EPred,VPred> >& v, const filtered_graph<Graph,EPred,VPred>& g){
22  return out_degree(v, g);
23  }
24 }
25 
26 namespace scaffold{
27 
28  // remove a matching pair, given only one of its two vertices
29  // return whether u was found and removed
30  template<class Graph>
31  bool remove_matching_pair(const Vertex<Graph>& u, Matching<Graph>& match){
32  const typename Matching<Graph>::iterator uv = match.find(u);
33  if(uv != match.end()){
34  const typename Matching<Graph>::iterator vu = match.find(uv->second);
35  match.erase(uv);
36  match.erase(vu);
37  return true;
38  } else return false;
39  }
40 
41 
42  // set up internal vertex indices by order of occurance in vertices(g)
43  template<class Graph>
44  void setup_vertex_indices(Graph& g, unsigned first_index = 0){
45  for(VertexIterRange<Graph> r = boost::vertices(g); r.first != r.second;++r.first)
46  g[*r.first].index = first_index++;
47  }// function
48 
49  // compute the FES from a given number of vertices, edges, and connected components
50  size_t compute_FES(const size_t v, const size_t e, const size_t cc){
51  return e + cc - v;
52  }
53 
54 
55  // split off a component of g and return it
56  // MOTE: while the remainder of g might contain more components, the split-off graph is connected!
57  template<class Graph>
58  Graph* split_off_component(Graph& g, Matching<Graph>* _translate = NULL, Matching<Graph>* _translate_back = NULL){
59  typedef typename Graph::vertex_bundled VertexInfo;
60  Matching<Graph>* translate = _translate ? _translate : new Matching<Graph>();
61  Matching<Graph>* translate_back = _translate_back ? _translate_back : new Matching<Graph>();
62 
63  assert(boost::num_vertices(g));
64  const Vertex<Graph> source = *boost::vertices(g).first;
65  Graph* h = new Graph;
66 
67  // Step 1: copy the component of the first vertex of g
68  DEBUG4(std::cout << "copying component of "<< VertexAndGraph<Graph>(source, g)<<std::endl);
69  boost::copy_component(g, source, *h, orig_to_copy(boost::associative_property_map<Matching<Graph> >(*translate)).vertex_index_map(boost::get(&VertexInfo::index, g)));
70  // if g didn't have at least 2 components, return NULL
71  if(boost::num_vertices(g) == boost::num_vertices(*h)){ delete h; return NULL;}
72  // Step 1a: invert the translation
73  for(auto& i : *translate) translate_back->emplace(i.second, i.first);
74  // Step 2: delete all translations of vertices of the copy
75  for(VertexIterRange<Graph> r = vertices(*h); r.first != r.second; ++r.first){
76  const Vertex<Graph> u = translate_back->at(*r.first);
77  boost::clear_vertex(u, g);
78  boost::remove_vertex(u, g);
79  }// for
80  if(!_translate) delete translate;
81  if(!_translate_back) delete translate_back;
82  // update indices and infos
83  setup_vertex_indices(g);
84  setup_vertex_indices(*h);
85  return h;
86  }
87 
88 
 // NOTE: this assumes that g is connected
 // set a map that assigns a number x to an edge e iff e is a bridge splitting away a component containing x feedback edges
 // implements a modification of Tarjan's bridge finding algorithm
 //
 // g:          graph to scan; the DFS starts at the first vertex of g and only visits what is reachable from it
 // bridge_map: output; for each bridge found, an entry keyed by (DFS parent endpoint, DFS child endpoint) is
 //             emplaced whose value is compute_FES() of the DFS subtree hanging below the bridge
 // if g has no edges, the function returns without touching bridge_map
 // the DFS is iterative: an explicit stack holds, per vertex, the not yet processed part of its adjacency range
 template<class Graph>
 void mark_bridges(const Graph& g, BridgeMap<Graph>& bridge_map){
   // set up infrastructure for Tarjan's algorithm
   struct TarjanInfos{ // modified Tarjan infos for a vertex v
     size_t index, // DFS index of v
            lowest_seen_index, // the lowest index that is at most 1 away from the DFS subtree rooted at v
            subtree_size, // the number of vertices in the DFS subtree rooted at v
            subtree_edges; // the number of edges in the DFS subtree rooted at v
     const Vertex<Graph> parent; // the parent of v in the DFS tree
     // setup Tarjan infos with just an index and subtree size 1
     TarjanInfos(const size_t _index, const Vertex<Graph>& _parent):
       index(_index), lowest_seen_index(_index), subtree_size(1), subtree_edges(0), parent(_parent) {}
   };
   // prepare a DFS vertex stack
   struct StackElement {
     Vertex<Graph> v; // the vertex this stack frame belongs to
     AdjIterRange<Graph> adj; // the part of v's adjacency range that still has to be processed
     // constructor
     StackElement(const Vertex<Graph>& _v, const AdjIterRange<Graph>& _adj):
       v(_v),
       adj(_adj) {}
   };

   // keep a map of tarjan infos of visited vertices
   // (presence of a vertex in this map doubles as its "visited" flag)
   unordered_map<Vertex<Graph>, TarjanInfos> tarjan_infos;
   // if there are no edges, there are also no bridges
   if(!num_edges(g)) return;
   // DFS root will be the first vertex of g
   VertexIter<Graph> root = vertices(g).first;
   std::stack<StackElement > adj_stack;
   // give the root an index and mark it visited (the root is recorded as its own parent)
   tarjan_infos.emplace(std::piecewise_construct, std::make_tuple(*root), std::make_tuple(0, *root));
   // add the root on top of the stack
// adj_stack.emplace(std::piecewise_construct, std::make_tuple(*root, adjacent_vertices(*root, g), false));
   adj_stack.emplace(*root, adjacent_vertices(*root, g));
   // assert that the root is not isolated
   // (the loop below dereferences the front of the adjacency range of the top stack element)
   assert(adj_stack.top().adj.first != adj_stack.top().adj.second);

   // walk through the graph in DFS, calculating indices on the way down and the other Tarjan Infos on the way back up
   size_t current_index = 0;
   while(!adj_stack.empty()){
     // get (v, r) from the stack
     StackElement& e = adj_stack.top();
     const Vertex<Graph>& v = e.v;
     AdjIterRange<Graph>& r = e.adj;
     TarjanInfos& v_infos(tarjan_infos.at(v));

     // check whether we're going up or down the DFS tree
     // (a visited vertex at the front of r means that its subtree has just been finished)
     if(contains(tarjan_infos, *r.first)){
       const TarjanInfos& u_infos(tarjan_infos.at(*r.first));
       // we're coming up from u
       DEBUG5(std::cout << "coming up from "<<VertexAndGraph<Graph>(*r.first,g)<<" who has seen "<<u_infos.lowest_seen_index<<" and his subtree has "<<u_infos.subtree_size<<" verts and "<<u_infos.subtree_edges<<" edges (we are at "<<VertexAndGraph<Graph>(v,g)<<" with index "<<v_infos.index<<")"<<std::endl);
       // if the lowest seen index of u is larger than the index of v, then uv is a bridge splitting u's subtree size
       if(u_infos.lowest_seen_index > v_infos.index){
         // compute the FES that is split away
         const size_t FES = compute_FES(u_infos.subtree_size, u_infos.subtree_edges, 1);
         // insert the bridge
         bridge_map.emplace(std::piecewise_construct, std::make_tuple(v, *r.first), std::make_tuple(FES));
         DEBUG5(std::cout<<"found bridge ("<<VertexAndGraph<Graph>(v,g)<<","<<VertexAndGraph<Graph>(*r.first,g)<<") splitting away "<<u_infos.subtree_size<<" vertices and "<<u_infos.subtree_edges<<" edges (FES "<<FES<<")"<<std::endl);
       } else // otherwise, update v's lowest seen index
         v_infos.lowest_seen_index = std::min(v_infos.lowest_seen_index, u_infos.lowest_seen_index);
       // update subtree infos of v (the "+ 1" accounts for the tree edge between v and u)
       v_infos.subtree_size += u_infos.subtree_size;
       v_infos.subtree_edges += u_infos.subtree_edges + 1;
       // skip to the next unvisited vertex, updating lowest_seen_index of v with the index of the neighbor and updating the #edges
       ++r.first;
       for( ; (r.first != r.second) && contains(tarjan_infos, *r.first); ++r.first){
         const TarjanInfos& w_infos(tarjan_infos.at(*r.first));
         // don't update smallest seen index if w is our parent or w is below us
         if((*r.first != v_infos.parent) && (w_infos.index < v_infos.index))
           v_infos.lowest_seen_index = std::min(v_infos.lowest_seen_index, w_infos.index);
         // if w is below us, then update the edges in our subtree
         if(w_infos.index > v_infos.index) ++v_infos.subtree_edges;
       }
     } // if
     if(r.first != r.second){
       // we're going down to *r.first, which is guaranteed to not have been visited at this point
       const Vertex<Graph>& u = *r.first;
       DEBUG5(std::cout << "going down to unvisited "<<VertexAndGraph<Graph>(u,g)<<" from "<<VertexAndGraph<Graph>(v,g)<<std::endl);
       // set up the tarjan infos of u (this also marks u as visited)
       TarjanInfos& u_infos = tarjan_infos.emplace(std::piecewise_construct, std::make_tuple(u), std::make_tuple(++current_index, v)).first->second;
       // before pushing, skip through u's neighbors so that the first one is unvisited, keeping lowest seen index up to date
       AdjIterRange<Graph> u_adj = adjacent_vertices(u, g);
       for( ; (u_adj.first != u_adj.second) ? contains(tarjan_infos, *u_adj.first) : false; ++u_adj.first)
         if(*u_adj.first != v) // don't update for v
           u_infos.lowest_seen_index = std::min(u_infos.lowest_seen_index, tarjan_infos.at(*u_adj.first).index);
       // if u is not a leaf of the DFS tree, then push u to the stack, otherwise, continue with v on the stack as "coming up from u"
       if(u_adj.first != u_adj.second) adj_stack.push(StackElement(u, u_adj));
     } else adj_stack.pop(); // if we arrived at the end, then pop v from the stack
   } // while
   DEBUG4(std::cout << "done finding "<<bridge_map.size()<<" bridges"<<std::endl);
 } // function
184 
185 
186  // return the number of isolated paths & cycles, provided that the maximum degree is 2
187  template<class Graph>
188  PathsAndCycles get_num_pc(const Graph& g){
189 
190  PathsAndCycles result;
191  VertexSet<Graph> marked;
192 
193  // first, find all paths
194  for(VertexIterRange<Graph> i = vertices(g); i.first != i.second; ++i.first){
195  const Vertex<Graph>& v = *(i.first);
196  if(degree(v, g) <= 1 && !contains(marked, v)){
197  // found another path
198  result.p++;
199  marked.insert(v);
200 
201  if(degree(v, g) == 1) {
202  // if we have a non-singleton path, go through it, marking the vertices
203  Vertex<Graph> u = v;
204  Vertex<Graph> last = u;
205  do{
206  AdjIter<Graph> nxt = adjacent_vertices(u, g).first;
207  if(*nxt == last) ++nxt;
208  last = u;
209  u = *nxt;
210  marked.insert(u);
211  } while(degree(u, g) == 2);
212  DEBUG5(std::cout << "finished path at "<<VertexAndGraph<Graph>(u,g)<<" with degree "<<degree(u, g)<<std::endl);
213  assert(degree(u, g) == 1);
214  } // if
215  } // if
216  } // if/for
217 
218  // second, find all cycles
219  for(VertexIterRange<Graph> i = vertices(g); i.first != i.second; ++i.first) if(!contains(marked, *(i.first))){
220  result.c++;
221  Vertex<Graph> u = *(i.first);
222  Vertex<Graph> last = u;
223  DEBUG3(std::cout << "found cycle starting at "<<VertexAndGraph<Graph>(u, g)<<":");
224  while(!contains(marked, u)){
225  assert(degree(u, g) == 2);
226  marked.insert(u);
227 
228  AdjIter<Graph> nxt = adjacent_vertices(u, g).first;
229  if(*nxt == last) ++nxt;
230  last = u;
231  u = *nxt;
232  DEBUG3(std::cout << VertexAndGraph<Graph>(u, g)<<" ");
233  }// while
234  DEBUG3(std::cout << std::endl);
235  }// if/for
236  return result;
237  }
238 
239 
240  // eat away treelike parts of the graph g, starting form all leaves in pending
241  template<class Graph>
242  void remove_treelike(Graph& g, unordered_set<Vertex<Graph> >* const pending){
243  // keep a set of new vertices
244  unordered_set<Vertex<Graph> > *old_batch = pending;
245  unordered_set<Vertex<Graph> > *new_batch = new unordered_set<Vertex<Graph> >;
246  unordered_set<Vertex<Graph> >* const delete_me = new_batch;
247 
248  while(!new_batch->empty() && !old_batch->empty()){
249  // if the old batch of vertices is used up, get the next batch of vertices in
250  if(old_batch->empty()) swap(new_batch, old_batch);
251  // get the first vertex on the waiting list
252  const Vertex<Graph> v = *(old_batch->begin());
253  old_batch->erase(old_batch->begin());
254  // if v is a leaf, insert its parent into the new batch (if it's not already in the old_batch)
255  if(degree(v, g) == 1){
256  const Vertex<Graph> u = *(adjacent_vertices(v, g).first);
257  if(!contains(*old_batch, u)) new_batch->insert(u);
258  // delete v from g
259  remove_vertex(v, g);
260  } // if
261  } // while
262  // delete the set we initialized and leave "pending" alone
263  delete delete_me;
264  } // function
265 
266 
267  template<class Graph>
268  void remove_treelike(Graph& g){
269  // make a set of all leaves and call remove_treelike for it
270  unordered_set<Vertex<Graph> > leaves;
271  for(VertexIterRange<Graph> r = vertices(g); r.first != r.second; ++r.first)
272  if(degree(*r.first) == 1)
273  leaves.insert(*r.first);
274  remove_treelike(g, &leaves);
275  }
276 
277 
278  // use a translate map to translate a matching
279  template<class Graph>
280  void copy_matching(const Matching<Graph>& from, Matching<Graph>& to, const VTranslateMap<Graph,Graph>& translate)
281  {
282  for(auto i : from) to[translate.at(i.first)] = translate.at(i.second);
283  }
284 
285 
286  // return whether a vertex pair equals an edge
287  template<class Graph>
288  bool is_equal(const VertexPair<Graph>& p, const Edge<Graph>& e, const Graph& g)
289  {
290  const Vertex<Graph>& u = source(e, g);
291  const Vertex<Graph>& v = target(e, g);
292  return ((u == p.first) && (v == p.second)) || ((u == p.second) && (v == p.first));
293  }
294 
295  // remove all vertices from a graph that are reachable from a set/list/queue/whatever of vertices
296  template<class Graph, class Container>
297  void remove_reachable(Graph& g, const Container& U, const Direction d)
298  {
299  VertexSet<Graph> pending(U.begin(), U.end());
300  while(!pending.empty()){
301  const Vertex<Graph> u = *(pending.begin());
302  pending.erase(pending.begin());
303  // step 1: add all neighbors of u to the list of next pending vertices
304  if((d == dir_fwd) || (d == dir_fwdrev))
305  for(OEdgeIterRange<Graph> r = boost::out_edges(u, g); r.first != r.second; ++r.first)
306  pending.insert(boost::target(*r.first, g));
307  if((d == dir_rev) || (d == dir_fwdrev))
308  for(IEdgeIterRange<Graph> r = boost::in_edges(u, g); r.first != r.second; ++r.first)
309  pending.insert(boost::source(*r.first, g));
310  // step 3: remove u from the aux graph
311  boost::clear_vertex(u, g);
312  boost::remove_vertex(u, g);
313  }// while
314  }// function
315 
316 } // namespace scaffold
317 
318 #endif
319 
320 
Definition: graph_utils.hpp:18
Definition: read_adj_list.hpp:22
bool contains(const Set &s, const Element &el)
a more readable containment check
Definition: utils.hpp:171
Definition: graph_typedefs.hpp:26