2 #ifndef NEGATIVE_LENGTH_HPP
3 #define NEGATIVE_LENGTH_HPP
8 #include "utils/scaffolding_typedefs.hpp"
// Upper bound on the search radius used by get_best_overlap(): start positions are tested at most this many
// characters to either side of the expected overlap position.
11 #define MAX_OVERLAP_RADIUS (unsigned)200
// fix_negative_non_contig_lengths() only accepts an alignment whose score exceeds this percentage of the
// theoretical maximum score (the score of an overlap with shift 0 and Hamming distance 0).
14 #define SCORE_THRESHOLD_PERCENTAGE 5.0
// fix_implausible_lengths() resets an edge length to NO_LENGTH if prob_of_picking_at_least() for that length
// is below this bound.
17 #define MINIMUM_SANE_CUMULATIVE_PROBABILITY 0.001
// A candidate overlap as built by get_best_overlap(): 'first' is c1_len - index (the number of characters from the
// tested start index to the end of the first sequence), 'second' is the score computed by compute_score().
30 typedef std::pair<int, float> Alignment;
33 #warning TODO: find a good alignment score! For now, we are using an adaptation of FLASH (http://ccb.jhu.edu/software/FLASH/): MAXIMIZE (inverse average Hamming distance) * (probability of the overlap (assuming gaussian distribution around the mean overlap))
36 bool is_better(
const Alignment& better,
const Alignment& worse)
38 return better.second > worse.second;
42 float compute_score(
const int shift,
const float length,
const float ham_dist,
const normal_distribution& distribution)
44 const float inverse_avg_ham_dist = length / (ham_dist + 0.1);
45 return distribution.probability(shift) * inverse_avg_ham_dist;
// Search for the best-scoring overlap between the end of c1 and the beginning of c2.
// c1, c2:       the two sequences; suffixes of c1 are compared against the beginning of c2
// overlap:      expected overlap length; the search is centered at position c1_len - overlap in c1
// distribution: passed through to compute_score() to weigh the shift away from the expected position
// NOTE(review): the line numbers embedded in this listing have gaps; the declaration of 'best' and the
// return statement are not visible here.
53 Alignment get_best_overlap(
const std::string& c1,
54 const std::string& c2,
55 const unsigned overlap,
56 const normal_distribution& distribution)
// never search further than MAX_OVERLAP_RADIUS (or the expected overlap itself, if smaller) from the center
58 const unsigned radius = std::min(overlap, MAX_OVERLAP_RADIUS);
59 const unsigned c1_len = c1.length();
60 const unsigned c2_len = c2.length();
// index in c1 at which c2 is expected to start; 0 if the expected overlap is at least as long as c1
61 const unsigned center_index = (c1_len > overlap) ? c1_len - overlap : 0;
// search window [lower_index, upper_index]; the lower end is clamped to 0 to avoid unsigned wrap-around
63 const unsigned lower_index = (center_index > radius) ? center_index - radius : 0;
// NOTE(review): upper_index is not clamped to c1_len; when overlap > c1_len it can exceed c1_len, in which case
// 'c1_len - index' below wraps around (unsigned) - TODO confirm that callers rule this out
64 const unsigned upper_index = center_index + radius;
// try every start index in the window and keep the best-scoring candidate
68 for(
unsigned index = lower_index; index <= upper_index; ++index){
// number of characters that can be compared: the rest of c1 from 'index' on, but at most all of c2
69 const unsigned length = std::min(c1_len - index, c2_len);
// signed offset of this start index from the expected one
70 const int shift = (int)index - (
int)center_index;
// mismatches between c1 starting at 'index' and the beginning of c2 over 'length' characters
72 const unsigned ham_dist =
hamming_distance(c1.c_str() + index, c2.c_str(), length);
// the candidate stores the distance from 'index' to the end of c1 together with its score
74 Alignment tmp(c1_len - index, compute_score(shift, length, ham_dist, distribution));
81 if(is_better(tmp, best)) best = tmp;
// Replace the path x-u-v-y by a single matching edge x-y: u and v are deleted from sg and a new matching edge
// between x and y is added, carrying NO_WEIGHT, total_length and uv_multi in its ScafEdgeProperty.
// Returns the newly added edge.
// NOTE(review): the parameters x, u, v and y used in the body are not visible in this listing (the embedded
// line numbers jump from 88 to 93).
88 const ScafEdge contract_non_contig(ScaffoldGraph& sg,
93 const int total_length,
94 const unsigned uv_multi)
96 #warning TODO: care for the case that xu & vy are merged and xy is already a non-contig edge!!!
// precondition (checked in debug builds only): x and y are not yet adjacent
97 assert(!sg.adjacent(x, y));
// NOTE(review): the meaning of the differing boolean arguments (false for u, true for v) is not visible here
99 sg.delete_vertex(u,
false);
100 sg.delete_vertex(v,
true);
// add_matching_edge() returns a pair; '.second' is asserted to be true and '.first' is the new edge
101 auto add_edge_result = sg.add_matching_edge(x, y, ScafEdgeProperty(NO_WEIGHT, total_length, uv_multi));
102 assert(add_edge_result.second);
103 return add_edge_result.first;
110 void reorient_and_merge(
const std::pair<std::string, OrientedSequence>& named_seq1,
111 const std::pair<std::string, OrientedSequence>& named_seq2,
112 const VertexName& outer1,
113 const VertexName& outer2,
114 const unsigned overlap,
115 std::string& new_seq,
116 std::string& new_seq_name)
119 OrientedSequence seq1(named_seq1.second);
120 if(seq1.start_vertex != outer1) {
121 seq1.reverse_complement(outer1);
122 new_seq_name = get_reversed_name(named_seq1.first);
123 }
else new_seq_name = named_seq1.first;
128 OrientedSequence seq2(named_seq2.second);
129 if(seq2.start_vertex == outer2) {
130 seq2.reverse_complement();
131 new_seq_name += get_reversed_name(named_seq2.first);
132 }
else new_seq_name += named_seq2.first;
133 DEBUG5(std::cout <<
"merging seqeunces "<<named_seq1.first<<
" (len "<< named_seq1.second.sequence.length() <<
") & "<< named_seq2.first<<
" (len "<< named_seq2.second.sequence.length()<<
") with overlap "<<overlap<<std::endl;)
135 new_seq =
merge_strings(seq1.sequence, seq2.sequence, overlap);
137 DEBUG5(std::cout <<
"merged successfully to "<<new_seq_name<<
" (len "<< new_seq.length()<<
")"<<std::endl;)
// Try to resolve non-matching (non-contig) edges by overlapping the sequences of the two contigs they connect.
// For a non-matching edge uv with incident matching edges xu and vy, the best overlap of the two contig sequences
// around the expected overlap -e_length is computed. If its score beats the threshold, the two sequences are
// merged, u and v are contracted away and the new edge xy is linked to the merged sequence.
// sg:        scaffold graph, modified in place
// sequences: map from contig name to oriented sequence, modified in place
// NOTE(review): the line numbers embedded in this listing have gaps (e.g. 161, 172-174, 224-231, 233 ff.);
// guards such as a test on e_length, else-branches and the iterator increment for edges that are not
// contracted are not visible here.
142 void fix_negative_non_contig_lengths(ScaffoldGraph& sg,
143 SequenceMap& sequences)
145 const RawScaffoldGraph& g = sg.get_graph();
146 const unsigned standard_deviation = sg.get_graph_property().standard_deviation;
147 const normal_distribution dist(standard_deviation);
// count how many matching edges refer to each contig name
150 boost::unordered_map<std::string, unsigned> use_count;
151 for(
auto e_it = sg.get_matching_edges(); e_it.is_valid(); ++e_it){
152 const std::string& cname = sg[*e_it].contig_name;
153 const auto used_it = use_count.DEEP_EMPLACE(cname, 1);
// the name was already present: increment its counter instead
155 if(!used_it.second) ++(used_it.first->second);
// the loop header has no increment; e_it is advanced inside the body (see the while loop below)
158 for(
auto e_it = sg.get_non_matching_edges(); e_it.is_valid();){
// NOTE(review): 'e' is bound to *e_it and still used after e_it has been advanced below - TODO confirm that
// dereferencing the iterator yields a value (and not a reference into the iterator)
159 const ScafEdge& e = *e_it;
160 int& e_length = sg[e].length;
162 DEBUG3(std::cout <<
"found edge "<<sg.get_edge_name(e)<<
" with length "<<e_length<<std::endl);
// u, v: endpoints of the non-matching edge; xu, vy: the matching edges incident to them
164 const ScafVertex u = boost::source(e, g);
165 const ScafVertex v = boost::target(e, g);
166 const ScafEdge xu = sg.incident_matching_edge(u);
167 const ScafEdge vy = sg.incident_matching_edge(v);
168 const int xu_length = sg[xu].length;
169 const int vy_length = sg[vy].length;
// length of the edge that replaces xu, uv and vy if they get contracted
170 const int xuvy_length = xu_length + vy_length + e_length;
171 DEBUG5(std::cout <<
"xu+uv+vy has length "<<xuvy_length<<std::endl);
175 DEBUG5(std::cout << sg.get_edge_name(xu) <<
" is linked to contig '"<<sg[xu].contig_name<<
"'"<<std::endl);
// look up the sequences of the two contigs by name
// NOTE(review): the results of find() are dereferenced without a check against end() in the visible lines
177 const std::string& xu_cname = sg[xu].contig_name;
178 auto u_contig_it = sequences.find(xu_cname);
179 const OrientedSequence& u_contig = u_contig_it->second;
180 DEBUG5(std::cout <<
"start vertex of "<<sg[xu].contig_name<<
" is "<<u_contig.start_vertex<<std::endl);
182 const std::string& vy_cname = sg[vy].contig_name;
183 auto v_contig_it = sequences.find(vy_cname);
184 const OrientedSequence& v_contig = v_contig_it->second;
185 DEBUG5(std::cout <<
"start vertex of "<<v_contig_it->first<<
" is "<<v_contig.start_vertex<<std::endl);
// the first sequence is reverse-complemented if it starts at u, the second one unless it starts at v
// (presumably so that xu_seq ends at u and vy_seq starts at v - TODO confirm)
187 const std::string xu_seq( (u_contig.start_vertex == sg[u].name) ? reverse_complement(u_contig.sequence) : u_contig.sequence);
188 const std::string vy_seq( (v_contig.start_vertex == sg[v].name) ? v_contig.sequence : reverse_complement(v_contig.sequence));
// the expected overlap is the negated edge length
189 const Alignment best = get_best_overlap(xu_seq, vy_seq, -e_length, dist);
// score of an overlap of the expected length at the expected position (shift 0) without any mismatch
192 const float theoretical_max_score = compute_score(0, -e_length, 0, dist);
// only the score (second component) of the threshold is compared by is_better()
193 const Alignment threshold(0, (
float)theoretical_max_score * SCORE_THRESHOLD_PERCENTAGE / 100.0);
195 DEBUG5(std::cout <<
"best alignment of "<<sg.get_edge_name(xu)<<
" & "<<sg.get_edge_name(vy)<<
": "<<best.first<<
"bp before the end of the first sequence scoring "<<best.second<<
" vs. threshold "<<threshold.second<<
" ("<<SCORE_THRESHOLD_PERCENTAGE<<
"% of theoretical max "<<theoretical_max_score<<
")"<<std::endl);
197 if(is_better(best, threshold)){
// advance the edge iterator past all edges incident to u or v before these vertices are deleted by
// contract_non_contig() below
200 while( (e_it.is_valid()) && ((boost::source(*e_it, g) == u) || (boost::target(*e_it, g) == u)
201 || (boost::source(*e_it, g) == v) || (boost::target(*e_it, g) == v)) ) ++e_it;
203 std::string new_seq, new_seq_name;
// x, y: the outer endpoints, i.e. the vertices matched with u and v
204 const ScafVertex x = sg.matched_with(u);
205 const ScafVertex y = sg.matched_with(v);
207 const VertexName& new_start = sg[x].name;
208 const VertexName& new_end = sg[y].name;
// merge the two contig sequences using the overlap length found above
209 reorient_and_merge(*u_contig_it, *v_contig_it, new_start, new_end, best.first, new_seq, new_seq_name);
// a sequence is erased if this was its only user, otherwise its use count is decremented
211 unsigned& xu_cname_count = use_count[xu_cname];
212 unsigned& vy_cname_count = use_count[vy_cname];
213 if(xu_cname_count == 1) sequences.erase(u_contig_it);
else --xu_cname_count;
214 if(vy_cname_count == 1) sequences.erase(v_contig_it);
else --vy_cname_count;
// replace x-u-v-y by a single matching edge x-y, keeping the multiplicity of uv
217 const ScafEdge new_e = contract_non_contig(sg, x, u, v, y, xuvy_length, sg[e].multiplicity);
218 DEBUG5(std::cout <<
"new edge "<<sg.get_edge_name(new_e)<<
" ("<<new_start<<
"->"<<new_end<<
") has sequence '"<<new_seq_name<<
"' starting at "<<new_start<<
": "<<new_seq<<std::endl);
220 sg[new_e].contig_name = new_seq_name;
// register the merged sequence; if a sequence of that name already exists, only its use count is incremented
222 const bool emplaced = sequences.emplace(std::piecewise_construct, std::make_tuple(new_seq_name), std::make_tuple(new_seq, new_start)).second;
223 if(emplaced) use_count.DEEP_EMPLACE(new_seq_name, 1);
else use_count[new_seq_name]++;
// NOTE(review): the branch enclosing this reset is not visible here; e_length refers to the edge uv, which
// must still exist at this point - TODO confirm this is not reached after the contraction above
232 e_length = NO_LENGTH;
// Variant of fix_negative_non_contig_lengths() that works on the graph only, without looking at sequences.
// For a non-matching edge uv with incident matching edges ux and vy, the path x-u-v-y is contracted to a single
// matching edge whose length is the sum of the three edge lengths.
// NOTE(review): the line numbers embedded in this listing have gaps (249-250, 260-262, 264-265, 267 ff.);
// the conditions that guard the contraction and the final reset are not visible here.
243 void fix_negative_non_contig_lengths_without_sequences(ScaffoldGraph& sg)
245 const RawScaffoldGraph& g = sg.get_graph();
// NOTE(review): unlike fix_negative_non_contig_lengths(), this loop does not visibly advance e_it past the
// edges incident to u and v before contract_non_contig() deletes these vertices - TODO confirm iterator validity
246 for(
auto e_it = sg.get_non_matching_edges(); e_it.is_valid(); ++e_it){
247 const ScafEdge& e = *e_it;
248 int& e_length = sg[e].length;
251 const ScafVertex u = boost::source(e, g);
252 const ScafVertex v = boost::target(e, g);
253 const ScafEdge ux = sg.incident_matching_edge(u);
254 const ScafEdge vy = sg.incident_matching_edge(v);
// NOTE(review): x and y are taken as the targets of ux and vy, which presumes that u and v are their sources;
// fix_negative_non_contig_lengths() uses sg.matched_with() instead - TODO confirm
255 const ScafVertex x = boost::target(ux, g);
256 const ScafVertex y = boost::target(vy, g);
257 const int xu_length = sg[ux].length;
258 const int vy_length = sg[vy].length;
// length of the edge that replaces ux, uv and vy
259 const int xuvy_length = xu_length + vy_length + e_length;
// the returned new edge is not used here
263 contract_non_contig(sg, x, u, v, y, xuvy_length, sg[e].multiplicity);
266 e_length = NO_LENGTH;
// Reset the length of non-matching edges to NO_LENGTH if that length is implausible. Two checks are visible:
//  1. the implied overlap (-e_length) is at least as long as one of the two adjacent contigs
//  2. prob_of_picking_at_least(e_length), under a normal_distribution built from the graph's standard_deviation
//     and insert_size, is below MINIMUM_SANE_CUMULATIVE_PROBABILITY
// NOTE(review): the line numbers embedded in this listing have gaps (284, 295-296, 302 ff.); how the two checks
// are nested or chained is not visible here.
277 void fix_implausible_lengths(ScaffoldGraph& sg)
279 const ScafGraphProperty& gp = sg.get_graph_property();
280 const RawScaffoldGraph& g = sg.get_graph();
281 for(
auto e_it = sg.get_non_matching_edges(); e_it.is_valid(); ++e_it){
282 const ScafEdge& e = *e_it;
// reference into the graph: assignments to e_length below modify the edge directly
283 int& e_length = sg[e].length;
285 const ScafVertex u = boost::source(e, g);
286 const ScafVertex v = boost::target(e, g);
287 const ScafEdge ux = sg.incident_matching_edge(u);
288 const ScafEdge vy = sg.incident_matching_edge(v);
289 const int xu_length = sg[ux].length;
290 const int vy_length = sg[vy].length;
291 #warning TODO: should we rather say a contig length is invalid if THE SUM of the two incident contig lengths is too small instead of the MIN? Then, we would have to try to overlap them...
// check 1: the overlap would cover at least one of the two adjacent contigs completely
292 if((-e_length >= xu_length) || (-e_length >= vy_length)){
293 DEBUG5(std::cout << sg.get_edge_name(e) <<
" has implausible length "<<e_length<<
" (adjacent contigs have lengths "<<xu_length<<
" & "<<vy_length<<
")"<<std::endl);
294 e_length = NO_LENGTH;
// check 2: the length is too unlikely under the insert-size distribution
// NOTE(review): the distribution depends only on gp and is constructed anew in every iteration
297 const normal_distribution length_distribution(gp.standard_deviation, gp.insert_size);
298 const float prob = length_distribution.prob_of_picking_at_least(e_length);
299 if(prob < MINIMUM_SANE_CUMULATIVE_PROBABILITY){
// NOTE(review): e_length is reset before the debug output below, so the message prints NO_LENGTH rather
// than the implausible length (check 1 above prints first and resets afterwards)
300 e_length = NO_LENGTH;
301 DEBUG5(std::cout << sg.get_edge_name(e) <<
" has implausible length "<<e_length<<
" (probability "<<prob<<
")"<<std::endl);
std::string merge_strings(const std::string &seq1, const std::string &seq2, const unsigned overlap)
merge two sequences seq1 and seq2 with overlap "overlap" according to their order ...
Definition: string_utils.hpp:134
Definition: read_adj_list.hpp:22
unsigned hamming_distance(const std::string &s1, const std::string &s2)
returns the Hamming distance between the maximal prefixes of equal length
Definition: string_utils.hpp:83