Scaffolding  0.1
This program can assemble genome scaffolds using the pairing information in paired-end reads.
negative_length.hpp
1 
2 #ifndef NEGATIVE_LENGTH_HPP
3 #define NEGATIVE_LENGTH_HPP
4 
5 #include <cmath>
6 #include "utils/string_utils.hpp"
8 #include "utils/scaffolding_typedefs.hpp"
9 
// the maximum distance to the overlap "middle" when overlapping contigs
// NOTE: written as an unsigned literal instead of a cast expression, so the macro expands to a
// single primary expression and still has type 'unsigned' (as std::min(overlap, ...) requires)
#define MAX_OVERLAP_RADIUS 200u

// percentage of the theoretical maximum score under which alignments are not considered meaningful
#define SCORE_THRESHOLD_PERCENTAGE 5.0

// cumulative probability under which non-contig lengths are considered insane
#define MINIMUM_SANE_CUMULATIVE_PROBABILITY 0.001
18 
19 
20 namespace scaffold{
21 
22  // handle non-contig edges of negative length:
23  // 1. try to merge the contigs on both ends of the edge:
24  // for each (sane) alignment:
25  // compute the Hamming distance of the overlapping parts and penalize by the shift-difference to the non-contig length
26  // NOTE that shifting one contig |length| base-pairs gives obviously a perfect alignment, so only check 2|length| times
27  // 2. allow some modification of the contig length
28 
// an alignment has an overlap (first) and a score (second)
typedef std::pair<int, float> Alignment;


// NOTE: the message is quoted because an unquoted "//" (as in the URL) starts a comment and would cut the warning short
#warning "TODO: find a good alignment score! For now, we are using an adaptation of FLASH (http://ccb.jhu.edu/software/FLASH/): MAXIMIZE (inverse average Hamming distance) * (probability of the overlap (assuming gaussian distribution around the mean overlap))"
// define when an alignment is better than another alignment
// only the scores are compared, the overlaps are ignored
// returns true iff "better" scores strictly higher than "worse" (equal scores are NOT better)
// 'inline' since the function is defined in a header: otherwise, including this header in
// more than one translation unit leads to multiple-definition errors at link time
inline bool is_better(const Alignment& better, const Alignment& worse)
{
  return better.second > worse.second;
}
40 
41  // compute the score of an overlap, given the shift, the length of the overlap and the hamming distance
42  float compute_score(const int shift, const float length, const float ham_dist, const normal_distribution& distribution)
43  {
44  const float inverse_avg_ham_dist = length / (ham_dist + 0.1); // +0.1 to avoid division by 0
45  return distribution.probability(shift) * inverse_avg_ham_dist;
46  }
47 
48  // get the best alignment of the two strings c1 & c2 that are supposed to overlap near "overlap":
49  // ====c1======
50  // =====c2====
51  // --> overlap = 3
52  // we try shifting c2 by -overlap...(overlap-1) and return the best scoring Alignment
53  Alignment get_best_overlap(const std::string& c1,
54  const std::string& c2,
55  const unsigned overlap,
56  const normal_distribution& distribution)
57  {
58  const unsigned radius = std::min(overlap, MAX_OVERLAP_RADIUS);
59  const unsigned c1_len = c1.length();
60  const unsigned c2_len = c2.length();
61  const unsigned center_index = (c1_len > overlap) ? c1_len - overlap : 0;
62  // how far to deviate from the center index
63  const unsigned lower_index = (center_index > radius) ? center_index - radius : 0;
64  const unsigned upper_index = center_index + radius;
65 
66  // compute scores for each index of c1 where to put c2
67  Alignment best(0, 0);
68  for(unsigned index = lower_index; index <= upper_index; ++index){
69  const unsigned length = std::min(c1_len - index, c2_len);
70  const int shift = (int)index - (int)center_index;
71  // get the score
72  const unsigned ham_dist = hamming_distance(c1.c_str() + index, c2.c_str(), length);
73  // update the best scoring alignment
74  Alignment tmp(c1_len - index, compute_score(shift, length, ham_dist, distribution));
75 /* DEBUG5(
76  std::cout << "shift-"<<shift<<" alignment:"<<std::endl;
77  for(unsigned i = 0; i < index; ++i) std::cout << "_"; std::cout << c2<<std::endl;
78  std::cout <<c1<<std::endl;
79  std::cout << "has average hamming distance "<<((float)ham_dist)/(length+0.0001)<<" and thus scores "<<tmp.second<<std::endl;
80  );*/
81  if(is_better(tmp, best)) best = tmp;
82  }
83  return best;
84  }
85 
86 
87  // contract a non-matching edge uv, merging the contigs xu & vy & return the merged contig edge
88  const ScafEdge contract_non_contig(ScaffoldGraph& sg,
89  const ScafVertex& x,
90  const ScafVertex& u,
91  const ScafVertex& v,
92  const ScafVertex& y,
93  const int total_length,
94  const unsigned uv_multi)
95  {
96 #warning TODO: care for the case that xu & vy are merged and xy is already a non-contig edge!!!
97  assert(!sg.adjacent(x, y));
98 
99  sg.delete_vertex(u, false);
100  sg.delete_vertex(v, true);
101  auto add_edge_result = sg.add_matching_edge(x, y, ScafEdgeProperty(NO_WEIGHT, total_length, uv_multi));
102  assert(add_edge_result.second);
103  return add_edge_result.first;
104  }
105 
106 
107  // merge two oriented sequences seq1 & seq2 AFTER reorienting them according to outer1 & outer2
108  // NOTE: outer1 has to be an endpoint of the contig corresponding to named_seq1 (and the same for 2)
109  // such that the result is FROM outer1 TO outer2
110  void reorient_and_merge(const std::pair<std::string, OrientedSequence>& named_seq1,
111  const std::pair<std::string, OrientedSequence>& named_seq2,
112  const VertexName& outer1,
113  const VertexName& outer2,
114  const unsigned overlap,
115  std::string& new_seq,
116  std::string& new_seq_name)
117  {
118  // step 1: add the name & (reversed) sequence of named_seq1
119  OrientedSequence seq1(named_seq1.second);
120  if(seq1.start_vertex != outer1) {
121  seq1.reverse_complement(outer1);
122  new_seq_name = get_reversed_name(named_seq1.first);
123  } else new_seq_name = named_seq1.first;
124 
125  new_seq_name += "+";
126 
127  // step 1: add the name & (reversed) sequence of named_seq2
128  OrientedSequence seq2(named_seq2.second);
129  if(seq2.start_vertex == outer2) {
130  seq2.reverse_complement();
131  new_seq_name += get_reversed_name(named_seq2.first);
132  } else new_seq_name += named_seq2.first;
133  DEBUG5(std::cout << "merging seqeunces "<<named_seq1.first<<" (len "<< named_seq1.second.sequence.length() <<") & "<< named_seq2.first<< " (len "<< named_seq2.second.sequence.length()<<") with overlap "<<overlap<<std::endl;)
134 
135  new_seq = merge_strings(seq1.sequence, seq2.sequence, overlap);
136 
137  DEBUG5(std::cout << "merged successfully to "<<new_seq_name<<" (len "<< new_seq.length()<<")"<<std::endl;)
138  }
139 
140  // fix negative non-contig length by either merging the contigs or resetting the length to 0
141  // the best contig overlap is determined by get_best_overlap
142  void fix_negative_non_contig_lengths(ScaffoldGraph& sg,
143  SequenceMap& sequences)
144  {
145  const RawScaffoldGraph& g = sg.get_graph();
146  const unsigned standard_deviation = sg.get_graph_property().standard_deviation;
147  const normal_distribution dist(standard_deviation);
148 
149  // we keep a count indicating if a sequence is still needed, so we can delete it if necessary
150  boost::unordered_map<std::string, unsigned> use_count;
151  for(auto e_it = sg.get_matching_edges(); e_it.is_valid(); ++e_it){
152  const std::string& cname = sg[*e_it].contig_name;
153  const auto used_it = use_count.DEEP_EMPLACE(cname, 1);
154  // if no insertion took place, increase use count instead
155  if(!used_it.second) ++(used_it.first->second);
156  }
157 
158  for(auto e_it = sg.get_non_matching_edges(); e_it.is_valid();){
159  const ScafEdge& e = *e_it;
160  int& e_length = sg[e].length;
161  if(e_length < 0){
162  DEBUG3(std::cout << "found edge "<<sg.get_edge_name(e)<<" with length "<<e_length<<std::endl);
163  // step 1: get the two incident contigs
164  const ScafVertex u = boost::source(e, g);
165  const ScafVertex v = boost::target(e, g);
166  const ScafEdge xu = sg.incident_matching_edge(u);
167  const ScafEdge vy = sg.incident_matching_edge(v);
168  const int xu_length = sg[xu].length;
169  const int vy_length = sg[vy].length;
170  const int xuvy_length = xu_length + vy_length + e_length;
171  DEBUG5(std::cout << "xu+uv+vy has length "<<xuvy_length<<std::endl);
172 
173  // only make changes if the total length afterwards is > 0
174  if(xuvy_length > 0){
175  DEBUG5(std::cout << sg.get_edge_name(xu) << " is linked to contig '"<<sg[xu].contig_name<<"'"<<std::endl);
176  // step 2: now, we can read u->v or v->u, so try the two
177  const std::string& xu_cname = sg[xu].contig_name;
178  auto u_contig_it = sequences.find(xu_cname);
179  const OrientedSequence& u_contig = u_contig_it->second;
180  DEBUG5(std::cout << "start vertex of "<<sg[xu].contig_name<<" is "<<u_contig.start_vertex<<std::endl);
181 
182  const std::string& vy_cname = sg[vy].contig_name;
183  auto v_contig_it = sequences.find(vy_cname);
184  const OrientedSequence& v_contig = v_contig_it->second;
185  DEBUG5(std::cout << "start vertex of "<<v_contig_it->first<<" is "<<v_contig.start_vertex<<std::endl);
186 
187  const std::string xu_seq( (u_contig.start_vertex == sg[u].name) ? reverse_complement(u_contig.sequence) : u_contig.sequence);
188  const std::string vy_seq( (v_contig.start_vertex == sg[v].name) ? v_contig.sequence : reverse_complement(v_contig.sequence));
189  const Alignment best = get_best_overlap(xu_seq, vy_seq, -e_length, dist);
190 
191  // step 3: according to the best alignment, merge the two contigs or set the non-contig length to 0
192  const float theoretical_max_score = compute_score(0, -e_length, 0, dist);
193  const Alignment threshold(0, (float)theoretical_max_score * SCORE_THRESHOLD_PERCENTAGE / 100.0);
194 
195  DEBUG5(std::cout << "best alignment of "<<sg.get_edge_name(xu)<<" & "<<sg.get_edge_name(vy)<<": "<<best.first<<"bp before the end of the first sequence scoring "<<best.second<<" vs. threshold "<<threshold.second<<" ("<<SCORE_THRESHOLD_PERCENTAGE<<"% of theoretical max "<<theoretical_max_score<<")"<<std::endl);
196 
197  if(is_better(best, threshold)){
198  // if we have a convincing alignment, merge the two contigs using the alignments shift value
199  // first, save our iterator
200  while( (e_it.is_valid()) && ((boost::source(*e_it, g) == u) || (boost::target(*e_it, g) == u)
201  || (boost::source(*e_it, g) == v) || (boost::target(*e_it, g) == v)) ) ++e_it;
202  // then, merge the sequences
203  std::string new_seq, new_seq_name;
204  const ScafVertex x = sg.matched_with(u);
205  const ScafVertex y = sg.matched_with(v);
206  // if xu->vy provides the best overlap, construct a new sequence from x to y, otherwise, from y to x
207  const VertexName& new_start = sg[x].name;
208  const VertexName& new_end = sg[y].name;
209  reorient_and_merge(*u_contig_it, *v_contig_it, new_start, new_end, best.first, new_seq, new_seq_name);
210 
211  unsigned& xu_cname_count = use_count[xu_cname];
212  unsigned& vy_cname_count = use_count[vy_cname];
213  if(xu_cname_count == 1) sequences.erase(u_contig_it); else --xu_cname_count;
214  if(vy_cname_count == 1) sequences.erase(v_contig_it); else --vy_cname_count;
215 
216  // merge the contigs
217  const ScafEdge new_e = contract_non_contig(sg, x, u, v, y, xuvy_length, sg[e].multiplicity);
218  DEBUG5(std::cout << "new edge "<<sg.get_edge_name(new_e)<<" ("<<new_start<<"->"<<new_end<<") has sequence '"<<new_seq_name<<"' starting at "<<new_start<<": "<<new_seq<<std::endl);
219  // and assign the new sequence to it
220  sg[new_e].contig_name = new_seq_name;
221 
222  const bool emplaced = sequences.emplace(std::piecewise_construct, std::make_tuple(new_seq_name), std::make_tuple(new_seq, new_start)).second;
223  if(emplaced) use_count.DEEP_EMPLACE(new_seq_name, 1); else use_count[new_seq_name]++;
224  } else {
225  // if the threshold is better than the best score,
226  // then no alignment is convincing, so force the length of e to 0
227  sg[e].length = 0;
228  ++e_it;
229  }
230  } else {
231  // if xuvy has negative total length, then consider the length of e erroneous
232  e_length = NO_LENGTH;
233  ++e_it;
234  }
235 
236  } else ++e_it;
237  }// for all non-matching edges of sg
238  }
239 
240 
241  // treat negative non-contig lengths if we don't have sequences:
242  // just merge the contigs and modify the length (unless this length would be negative, in which case do nothing)
243  void fix_negative_non_contig_lengths_without_sequences(ScaffoldGraph& sg)
244  {
245  const RawScaffoldGraph& g = sg.get_graph();
246  for(auto e_it = sg.get_non_matching_edges(); e_it.is_valid(); ++e_it){
247  const ScafEdge& e = *e_it;
248  int& e_length = sg[e].length;
249  if(e_length < 0){
250  // step 1: get the two incident contigs
251  const ScafVertex u = boost::source(e, g);
252  const ScafVertex v = boost::target(e, g);
253  const ScafEdge ux = sg.incident_matching_edge(u);
254  const ScafEdge vy = sg.incident_matching_edge(v);
255  const ScafVertex x = boost::target(ux, g);
256  const ScafVertex y = boost::target(vy, g);
257  const int xu_length = sg[ux].length;
258  const int vy_length = sg[vy].length;
259  const int xuvy_length = xu_length + vy_length + e_length;
260 
261  // only merge contigs if the total length will be positive
262  if(xuvy_length > 0){
263  contract_non_contig(sg, x, u, v, y, xuvy_length, sg[e].multiplicity);
264  } else {
265  // otherwise consider the length of e erroneous
266  e_length = NO_LENGTH;
267  }
268 
269  }// if e has negative length
270  }// for all non-matching edges of sg
271  }
272 
273  // treat insane non-contig distances
274  // a non-contig length is insane if
275  // (a) it is negative and its absolute value is bigger than one of the incident contig lengths
276  // (b) it is positive and the probability of picking a value at least this length is below a given threshold
277  void fix_implausible_lengths(ScaffoldGraph& sg)
278  {
279  const ScafGraphProperty& gp = sg.get_graph_property();
280  const RawScaffoldGraph& g = sg.get_graph();
281  for(auto e_it = sg.get_non_matching_edges(); e_it.is_valid(); ++e_it){
282  const ScafEdge& e = *e_it;
283  int& e_length = sg[e].length;
284  if(e_length < 0){
285  const ScafVertex u = boost::source(e, g);
286  const ScafVertex v = boost::target(e, g);
287  const ScafEdge ux = sg.incident_matching_edge(u);
288  const ScafEdge vy = sg.incident_matching_edge(v);
289  const int xu_length = sg[ux].length;
290  const int vy_length = sg[vy].length;
291 #warning TODO: should we rather say a contig length is invalid if THE SUM of the two incident contig lengths is too small instead of the MIN? Then, we would have to try to overlap them...
292  if((-e_length >= xu_length) || (-e_length >= vy_length)){
293  DEBUG5(std::cout << sg.get_edge_name(e) << " has implausible length "<<e_length<<" (adjacent contigs have lengths "<<xu_length<<" & "<<vy_length<<")"<<std::endl);
294  e_length = NO_LENGTH;
295  }
296  } else {
297  const normal_distribution length_distribution(gp.standard_deviation, gp.insert_size);
298  const float prob = length_distribution.prob_of_picking_at_least(e_length);
299  if(prob < MINIMUM_SANE_CUMULATIVE_PROBABILITY){
300  e_length = NO_LENGTH;
301  DEBUG5(std::cout << sg.get_edge_name(e) << " has implausible length "<<e_length<<" (probability "<<prob<<")"<<std::endl);
302  }
303  }
304  }
305  }
306 }
307 
308 
309 #endif
310 
std::string merge_strings(const std::string &seq1, const std::string &seq2, const unsigned overlap)
merge two sequences seq1 and seq2 with overlap "overlap" according to their order ...
Definition: string_utils.hpp:134
Definition: read_adj_list.hpp:22
unsigned hamming_distance(const std::string &s1, const std::string &s2)
returns the Hamming distance between the maximal prefixes of equal length
Definition: string_utils.hpp:83