3 #ifndef SCAFFOLDING_UTILS_HPP
4 #define SCAFFOLDING_UTILS_HPP
7 #include "utils/graph_typedefs.hpp"
8 #include "utils/scaffolding_typedefs.hpp"
14 using namespace boost;
// Append to `jumps` one contig_jump for every way of "jumping over" a short contig.
// A contig is a pair (u, v) of the matching of `sg`; it counts as short when the
// length stored on its edge is below the insert_size graph property.
//   sg    - scaffold graph to inspect (read-only)
//   jumps - output list; entries are appended, existing entries are not touched
// NOTE(review): this listing skips original source lines (e.g. 36, 40-41, 45, 51, 53+),
// including braces and probably an `else`; the comments cover only the visible statements.
19 void get_contig_jumps(
const ScaffoldGraph& sg, std::list<contig_jump >& jumps)
21 const RawScaffoldGraph& g = sg.get_graph();
22 const ScafMatching& matching = sg.get_matching();
// visit every (vertex, matched vertex) pair of the matching
24 for(
const ScafMatching::value_type& contig: matching){
25 ScafVertex u = contig.first;
26 ScafVertex v = contig.second;
27 const ScafEdge& uv = sg.find_edge(u, v).first;
28 const unsigned contig_length = g[uv].length;
// only contigs shorter than the insert size are considered
31 if(contig_length < sg.get_graph_property().insert_size){
32 #warning TODO: implement detection of jumps over more than one contig
33 #warning TODO: for any given short contig, only keep the most supported jump
// normalize: if u is the degree-1 end of the contig, exchange the roles of u and v
34 if(boost::degree(u, g) == 1) std::swap(u, v);
// u has exactly one edge besides the contig edge and v has none:
// the jump consists of that single other edge at u
35 if((boost::degree(u, g) == 2) && (boost::degree(v, g) == 1)){
37 ScafOEdgeIter e = boost::out_edges(u, g).first;
// skip the contig edge itself
38 if(boost::target(*e, g) == v) ++e;
39 jumps.emplace_back(uv, std::initializer_list<ScafEdge >({*e}));
// combine every out-edge ux of u with every out-edge vy of v
// (presumably the `else` branch of the degree test above - the separating lines are not visible)
42 for(ScafOEdgeIterRange u_r = boost::out_edges(u, g); u_r.first != u_r.second; ++u_r.first){
43 const ScafEdge& ux = *u_r.first;
44 const ScafVertex& x = boost::target(ux, g);
46 for(ScafOEdgeIterRange v_r = boost::out_edges(v, g); v_r.first != v_r.second; ++v_r.first){
47 const ScafEdge& vy = *v_r.first;
48 const ScafVertex& y = boost::target(vy, g);
// ignore the contig edge seen from v, and pairs where x and y are matched to each other
49 if( (y != u) && (matching.at(x) != y)){
50 const std::pair<ScafEdge, bool> xy_pair = boost::edge(x, y, g);
// NOTE(review): no test of xy_pair.second is visible here (original line 51 is missing) -
// confirm that the jump is only recorded when the edge xy actually exists
52 jumps.emplace_back(uv, std::initializer_list<ScafEdge >({ux, xy_pair.first, vy}));
63 void fix_non_matching_multiplicities(ScaffoldGraph& sg)
65 const RawScaffoldGraph& g = sg.get_graph();
66 for(
auto range = boost::edges(g); range.first != range.second; ++range.first){
67 const ScafEdge& uv = *range.first;
68 if(!sg[uv].is_matching_edge()){
69 const ScafVertex& u = boost::source(uv, g);
70 const ScafVertex& v = boost::target(uv, g);
71 const ScafEdge ux = sg.incident_matching_edge(u);
72 const ScafEdge vy = sg.incident_matching_edge(v);
73 sg[uv].multiplicity = std::min(sg[ux].multiplicity, sg[vy].multiplicity);
75 DEBUG5(std::cout <<
"fixing multiplicity of "<<sg.get_edge_name(uv)<<
" (len "<<sg[uv].length<<
") to the min of "<<sg.get_edge_name(ux)<<
" ("<<sg[ux].multiplicity<<
") & "<<sg.get_edge_name(vy)<<
" ("<<sg[vy].multiplicity<<
")"<<std::endl);
// Delete non-matching edges whose weight is below `threshold`.
//   sg         - scaffold graph to prune; edges are removed through sg.delete_edge()
//   threshold  - weight bound; 0 turns the function into a no-op
//   min_degree - if at least 2, an edge is only deleted when both of its endpoints
//                have degree greater than min_degree; smaller values disable this test
// NOTE(review): this listing skips original source lines (95, 97-105); the loop header has
// no increment, so the iterator must be advanced in those missing lines - presumably before
// the deletion, so that the iterator being used is not the one invalidated. Confirm.
81 void cut_off_threshold(ScaffoldGraph& sg,
const unsigned threshold,
const unsigned min_degree = 0)
// nothing can weigh less than a threshold of 0
83 if(!threshold)
return;
84 const RawScaffoldGraph& g = sg.get_graph();
85 for(ScafEdgeIterRange r = boost::edges(g); r.first != r.second;){
86 const ScafEdge& uv = *r.first;
87 const ScafEdgeProperty& uv_info = g[uv];
// matching edges are never removed here
88 if(!uv_info.is_matching_edge()){
89 const ScafVertex& u = boost::source(uv, g);
90 const ScafVertex& v = boost::target(uv, g);
91 const unsigned uv_weight = g[uv].weight;
92 if(uv_weight < threshold){
// respect the minimum-degree constraint on both endpoints
93 if((min_degree < 2) || ((boost::degree(u, g) > min_degree) && (boost::degree(v, g) > min_degree))){
// remember the current position so it can be deleted through a separate iterator
94 const ScafEdgeIter to_del = r.first;
96 sg.delete_edge(*to_del);
106 void scale_weights(ScaffoldGraph& sg)
108 const RawScaffoldGraph& g = sg.get_graph();
109 for(
auto er = boost::edges(g); er.first != er.second; ++er.first) {
110 const ScafEdge& uv = *er.first;
111 ScafEdgeProperty& uv_info = sg[uv];
112 if(!uv_info.is_matching_edge()){
113 const unsigned uv_multi = uv_info.multiplicity;
114 DEBUG5(std::cout <<
"scaling "<<sg.get_edge_name(uv)<<
" of multiplicity "<<uv_multi<<std::endl);
115 #warning TODO: either use float as weight or scale up everything by the max multiplicity?
116 if(uv_multi != 1) uv_info.weight /= uv_multi;
122 void setup_vertex_names(RawScaffoldGraph& g)
124 for(ScafVIterRange r = boost::vertices(g); r.first != r.second; ++r.first) {
125 ScafVertexProperty& v_prop = g[*r.first];
126 if(v_prop.name.empty()) v_prop.index_to_name();
// Remove every non-matching edge incident to u from g; matching edges stay.
//   u - vertex whose out-edges are scanned
//   g - graph to modify
// NOTE(review): the loop header has no increment and original line 136 is missing from
// this listing; presumably the iterator is advanced there, after copying the edge and
// before removing it, so that the removal cannot invalidate the position in use. Confirm.
132 void clear_nonmatching(
const ScafVertex& u, RawScaffoldGraph& g)
134 for(
auto e_range = boost::out_edges(u, g); e_range.first != e_range.second;){
// take a copy of the edge descriptor; it is used after the (missing) iterator step
135 const ScafEdge e = *e_range.first;
137 if(!g[e].is_matching_edge())
138 boost::remove_edge(e, g);
// Decide whether a sequence starts at vertex u of g.
// For a vertex of degree exactly 2, the answer is whether its two incident edges
// carry different multiplicities.
// NOTE(review): the result for vertices of any other degree is produced in lines that
// are missing from this listing (original lines 152+).
144 bool is_sequence_start(
const ScafVertex& u,
const RawScaffoldGraph& g){
146 if(boost::degree(u, g) == 2) {
// fetch the two incident edges
148 auto e_iter = boost::out_edges(u, g).first;
149 const ScafEdge& e1 = *(e_iter++);
150 const ScafEdge& e2 = *e_iter;
151 return (g[e1].multiplicity != g[e2].multiplicity);
// Forward declaration of the vertex-based overload, needed because the edge-based
// overload defined next calls it.
// The last parameter defaults to NULL; the definition further down allocates its own
// vertex set in that case.
// NOTE(review): original line 157 (a parameter declaration, presumably the start vertex u)
// is missing from this listing.
156 void get_alternating_paths_of_max_length(
const ScaffoldGraph& sg,
158 const unsigned max_length,
159 std::list<AlternatingPath>& result,
160 const bool start_with_matched,
161 ScafVertexSet*
const _forbidden = NULL);
// Edge-based overload: collect the alternating paths that begin with the edge uv and
// continue from v, and append them to `result`.
//   sg                 - scaffold graph to search
//   max_length         - length budget handed on to the search from v
//   result             - output list; the paths found are spliced onto its end
//   start_with_matched - forwarded unchanged to the search from v
//   forbidden          - vertex set forwarded unchanged to the search from v
// NOTE(review): the declarations of the parameters `uv` and `v` (original lines 166-167)
// are missing from this listing.
165 void get_alternating_paths_of_max_length(
const ScaffoldGraph& sg,
168 const unsigned max_length,
169 std::list<AlternatingPath>& result,
170 const bool start_with_matched,
171 ScafVertexSet*
const forbidden)
// all continuations starting at v
175 std::list<AlternatingPath> paths_from_v;
176 get_alternating_paths_of_max_length(sg, v, max_length, paths_from_v, start_with_matched, forbidden);
// if there is no continuation, seed one empty path so that the path consisting of uv alone is reported
179 if(paths_from_v.empty()) paths_from_v.emplace_back(AlternatingPath());
// put uv in front of every continuation
180 for(
auto& p: paths_from_v) p.emplace_front(uv);
// move the finished paths into the result without copying them
183 result.splice(result.cend(), paths_from_v);
// Vertex-based overload: collect alternating paths starting at u and append them to `result`.
//   sg                 - scaffold graph to search
//   max_length         - remaining length budget; only the length of matching edges is
//                        subtracted from it in the visible code
//   result             - output list that receives the paths
//   start_with_matched - if true, the first edge of each path is the matching edge at u
//   _forbidden         - vertex set shared along the recursion; if NULL, a set is allocated
//                        here and deleted again before returning
// NOTE(review): this listing skips original source lines (191, 206, 211, 213-215) - among them
// the declaration of `u`, presumably an `else` between the two branches, and presumably a test
// of v against `forbidden` before the recursive call. Confirm against the real source.
190 void get_alternating_paths_of_max_length(
const ScaffoldGraph& sg,
192 const unsigned max_length,
193 std::list<AlternatingPath>& result,
194 const bool start_with_matched,
195 ScafVertexSet*
const _forbidden)
// reuse the caller's set if there is one, otherwise own a fresh one for this call
// NOTE(review): raw new/delete - the set is not released if anything in between throws
197 ScafVertexSet*
const forbidden = (_forbidden ? _forbidden :
new ScafVertexSet());
// record u in the shared set
198 forbidden->emplace(u);
199 const ScafVertex& u_match = sg.matched_with(u);
200 if(start_with_matched){
// follow the matching edge of u, paying for its length out of the budget
201 const ScafEdge uv = sg.find_edge(u, u_match).first;
202 const ScafEdgeProperty& uv_info = sg[uv];
// the condition of this ternary is always true inside the enclosing if
203 const unsigned uv_length = ( start_with_matched ? uv_info.length : 0);
204 if(uv_length <= max_length)
205 get_alternating_paths_of_max_length(sg, uv, u_match, max_length - uv_length, result,
false, forbidden);
// follow out-edges of u; the budget is passed on unchanged and the next step starts with a matching edge
207 const RawScaffoldGraph& g = sg.get_graph();
208 for(
auto range = boost::out_edges(u, g); range.first != range.second; ++range.first){
209 const ScafEdge uv = *range.first;
210 const ScafVertex& v = boost::target(uv, g);
212 get_alternating_paths_of_max_length(sg, uv, v, max_length, result,
true, forbidden);
// release the set only if it was allocated by this call
216 if(!_forbidden)
delete forbidden;
222 #warning TODO: optimize this avoiding all the string copies (maybe use a "fragmented string" class?)
// Take a solution graph apart: first remove ambiguous paths, then split the graph at
// "break points", i.e. vertices v whose matching edge uv and non-matching edge vw carry
// different multiplicities. At a break point, copies u', v' of u and v are added together
// with copies of the incident edges, and the multiplicities are divided between the
// originals and the copies.
//   sg - scaffold graph, modified in place
// NOTE(review): this listing skips original source lines (224, 226-227, 230-231, 265-270,
// 280+), including braces and the end of the function; the comments cover only the visible
// statements. In particular, whether vertices are re-inserted into `to_check` is not visible.
223 void deconstruct_solution(ScaffoldGraph& sg)
225 #warning TODO: use an external mapping of edgenames to contig names. Right now, we copy the contig names each time we copy the graph!
228 DEBUG5(std::cout <<
"removing ambigous paths..."<<std::endl;)
229 kill_ambigous_paths_brutal(sg);
232 DEBUG5(std::cout <<
"deconstructing the graph..."<<std::endl;)
233 const RawScaffoldGraph& g(sg.get_graph());
// work list, initially holding every vertex of the graph
235 boost::unordered_set<ScafVertex> to_check;
236 for(
auto range = boost::vertices(g); range.first != range.second; ++range.first)
237 to_check.insert(*range.first);
239 while(!to_check.empty()){
// take an arbitrary vertex off the work list
240 const ScafVertex v = *(to_check.cbegin());
241 to_check.erase(to_check.cbegin());
// only vertices with more than one incident edge can be break points
243 if(boost::degree(v, g) > 1){
// uv: matching edge at v (its target is taken as u); vw: a non-matching edge at v
245 const ScafEdge& uv = sg.incident_matching_edge(v);
246 const ScafVertex& u = boost::target(uv, g);
247 const ScafEdge vw = *(sg.get_incident_non_matching(v));
248 const ScafVertex& w = boost::target(vw, g);
249 const unsigned uv_multi = sg[uv].multiplicity;
250 const unsigned vw_multi = sg[vw].multiplicity;
// differing multiplicities mark a break point
252 if(uv_multi != vw_multi){
253 DEBUG5(std::cout <<
"detected break point at "<<sg[u].name<<
" --"<<sg[uv] <<
"--> "<<sg[v].name<<
" --x"<<sg[vw]<<
"--> "<<sg[w].name<<std::endl;)
// add new vertices built from u and v (presumably copies of them - confirm in ScaffoldGraph::add_vertex)
256 const ScafVertex u_prime = sg.add_vertex(u);
257 const ScafVertex v_prime = sg.add_vertex(v);
// connect the new vertices by a matching edge carrying uv's properties, and attach v' to w
259 const ScafEdge uv_prime = sg.add_matching_edge(u_prime, v_prime, sg[uv]).first;
260 const ScafEdge vw_prime = sg.add_edge(v_prime, w, sg[vw]).first;
// the new contig edge takes over vw_multi of the original multiplicity
261 sg[uv].multiplicity -= vw_multi;
262 sg[uv_prime].multiplicity = vw_multi;
263 DEBUG5(std::cout <<
"added matching edge "<<sg.get_edge_name(uv_prime)<<
" ["<<sg[uv_prime]<<
"]" <<std::endl;)
264 DEBUG5(std::cout <<
"added edge "<<sg.get_edge_name(vw_prime)<<
" ["<<sg[vw_prime]<<
"]" <<std::endl;)
// if u has a non-matching edge uy of its own, mirror it on u' and split its multiplicity the same way
271 if(sg.degree(u) > 1){
272 assert(sg.degree(u) == 2);
273 const ScafEdge& uy = *(sg.get_incident_non_matching(u));
274 const ScafVertex& y = boost::target(uy, g);
275 const ScafEdge yu_prime = sg.add_edge(y, u_prime, sg[uy]).first;
276 sg[uy].multiplicity -= vw_multi;
277 sg[yu_prime].multiplicity = vw_multi;
278 DEBUG5(std::cout <<
"added edge "<<sg.get_edge_name(yu_prime)<<
" ["<<sg[yu_prime]<<
"]" <<std::endl;)
// after the split, the remainder of uy must agree with the remainder of uv
279 assert(sg[uy].multiplicity == sg[uv].multiplicity);
293 unsigned is_reversed_sequence(
const std::string& name)
295 unsigned start_of_rev_indicator = name.length() - strlen(REVERSE_SEQUENCE_INDICATOR);
296 const unsigned end_of_name = start_of_rev_indicator - 1;
297 if(name[0] !=
'(')
return 0;
298 if(name[end_of_name] !=
')')
return 0;
299 if(name.substr(start_of_rev_indicator) != REVERSE_SEQUENCE_INDICATOR)
return 0;
302 const char* name_cptr = name.c_str();
303 for(
unsigned i = 1; i != end_of_name; ++i){
304 const char c = name_cptr[i];
306 case '(': ++level;
break;
307 case ')': --level;
break;
309 if(level == 0)
return 0;
311 return start_of_rev_indicator;
// Toggle the reversal marking of a sequence name in place.
// A name that is_reversed_sequence() recognizes as reversed has its marking removed;
// any other name is wrapped in parentheses and gets REVERSE_SEQUENCE_INDICATOR appended.
//   name - the name to modify
// NOTE(review): this listing skips original source lines (316-317, 321, 323+). The visible
// erase() only cuts off the closing parenthesis and everything after it; presumably the
// missing line 321 removes the leading '(' as well - confirm.
315 void indicate_reversal(std::string& name)
// non-zero means: name is reversed and the indicator starts at this index
318 const unsigned start_of_rev_indicator = is_reversed_sequence(name);
319 if(start_of_rev_indicator){
// drop the closing parenthesis (one position before the indicator) and the indicator itself
320 name.erase(start_of_rev_indicator-1);
322 }
else name =
'(' + name +
')' + REVERSE_SEQUENCE_INDICATOR;
325 std::string get_reversed_name(
const std::string& name)
327 std::string result = name;
328 indicate_reversal(result);
// Look up the complement of a base in COMPLEMENTARY_BASES.
// The index arithmetic below treats that table as consecutive pairs: an entry at an even
// offset is mapped to its right neighbour, an entry at an odd offset to its left neighbour.
//   base - the base character to complement
// NOTE(review): the return statements, including the result for a base that is not in the
// table, are in lines missing from this listing (original lines 338, 340+).
// NOTE(review): std::strchr also matches the terminating '\0', so base == '\0' yields an
// offset equal to the table length - confirm callers never pass it.
333 char get_complement(
const char base)
335 const char* complement_index = std::strchr(COMPLEMENTARY_BASES, base);
336 if(complement_index != NULL){
// position of the base within the table
337 const int offset = (complement_index - COMPLEMENTARY_BASES);
// pick the partner within the pair
339 char result = COMPLEMENTARY_BASES[(offset % 2) ? offset - 1 : offset + 1];
345 void reverse_complement_inplace(std::string& sequence)
347 const int seq_len = sequence.length();
348 const int last_index = seq_len - 1;
349 for(
int i = 0; 2 * i < last_index; ++i){
350 const char swap_base = sequence[i];
351 sequence[i] = get_complement(sequence[last_index - i]);
352 sequence[last_index - i] = get_complement(swap_base);
355 if(seq_len % 2) sequence[last_index / 2] = get_complement(sequence[last_index / 2]);
359 std::string reverse_complement(
const std::string& sequence)
361 std::string out(sequence);
362 reverse_complement_inplace(out);
368 std::string named_reverse_complement(
const std::string& sequence, std::string& name)
370 std::string out(sequence);
371 reverse_complement_inplace(out);
372 indicate_reversal(name);
// Write the sequences described by a solution graph into `out`.
// Starting from an unprocessed vertex, the function alternately follows a matching edge
// (a contig, whose sequence is appended in forward or reverse-complemented orientation)
// and a non-matching edge (a gap, appended as a run of 'N' if it has a length).
//   sg        - solution graph to read
//   sequences - contig name -> oriented sequence; looked up with at() for every contig
//   out       - receives one entry per assembled sequence, keyed by the assembled name
// NOTE(review): this listing skips many original source lines (378, 384, 386-387, 391-392,
// 398-399, 413, 417-419, 424-425, ...), including braces, `else` branches and the header
// of the walk loop; the comments cover only the visible statements.
377 void simple_solution_to_sequences(
const ScaffoldGraph& sg,
const SequenceMap& sequences, SequenceMap& out)
379 const RawScaffoldGraph& g = sg.get_graph();
// vertices still to be processed, split by degree: degree 1 in one set, the rest
// (presumably via an `else` on the missing line 384) in the other
380 boost::unordered_set<ScafVertex> to_check_deg1, to_check_deg2;
381 for(
auto range = boost::vertices(g); range.first != range.second; ++range.first)
382 if(sg.degree(*range.first) == 1)
383 to_check_deg1.insert(*range.first);
385 to_check_deg2.insert(*range.first);
// NOTE(review): with `&&`, the loop stops as soon as EITHER set is empty, and the ternary on
// the next line can never select to_check_deg2. The ternary suggests that `||` was intended
// (process until both sets are empty) - confirm.
388 while(!to_check_deg1.empty() && !to_check_deg2.empty()){
// prefer degree-1 vertices as starting points
389 boost::unordered_set<ScafVertex>& to_check = to_check_deg1.empty() ? to_check_deg2 : to_check_deg1;
390 ScafVertex u = *(to_check.cbegin());
393 DEBUG5(std::cout <<
"writing down sequence starting at "<<sg[u].name<<
": "<<std::endl);
// name and bases of the sequence assembled during this walk
396 std::string current_name;
397 std::string current_seq;
// remember the start vertex to notice when the walk closes a cycle
400 const ScafVertex cycle_start = u;
// step over the contig (matching edge) at u
403 const ScafEdge uv = sg.incident_matching_edge(u);
404 const ScafVertex v = boost::target(uv, g);
405 std::string contig_name = sg[uv].contig_name;
408 const OrientedSequence& os = sequences.at(contig_name);
409 DEBUG5(std::cout << contig_name <<
" (@"<<os.start_vertex<<
") ");
// the stored sequence starts at u: append it as it is
410 if(os.start_vertex == sg[u].name){
411 current_seq += os.sequence;
412 current_name += contig_name;
// otherwise it must start at v: append the reverse complement; named_reverse_complement
// also changes contig_name, and the changed name is what gets appended
414 assert(os.start_vertex == sg[v].name);
415 current_seq += named_reverse_complement(os.sequence, contig_name);
416 current_name += contig_name;
// u has been handled
420 to_check_deg2.erase(u);
423 const unsigned v_deg = sg.degree(v);
// v has been handled; continue along its non-matching edge
426 to_check_deg2.erase(v);
427 const ScafEdge vw = *(sg.get_incident_non_matching(v));
// a gap of known length is written as that many 'N's
430 const int vw_length = sg[vw].length;
431 if(vw_length != NO_LENGTH){
432 assert(sg[vw].length >= 0);
433 current_seq += std::string(sg[vw].length,
'N');
// move on to the far end of the gap
436 u = boost::target(vw, g);
// back at the start: mark the name as cyclic; otherwise join contig names with '+'
438 if(u == cycle_start) {
440 current_name += CYCLIC_SEQUENCE_INDICATOR;
442 }
else current_name +=
'+';
444 to_check_deg1.erase(v);
448 DEBUG5(std::cout << std::endl);
// make the name unique within `out` by appending apostrophes until it is unused
450 auto seq_it = out.find(current_name);
451 while(seq_it != out.cend()){
452 current_name +=
'\'';
453 seq_it = out.find(current_name);
// seq_it equals out.cend() at this point and only serves as the insertion hint
456 out.emplace_hint(seq_it, current_name, current_seq);
Definition: graph_utils.hpp:18
Definition: read_adj_list.hpp:22
bool contains(const Set &s, const Element &el)
a more readable containment check
Definition: utils.hpp:171