Scaffolding  0.1
This program can assemble genome scaffolds using the pairing information in paired-end reads.
string_utils.hpp
Go to the documentation of this file.
1 
2 
7 #ifndef STRING_UTILS_HPP
8 #define STRING_UTILS_HPP
9 
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <new>
#include <numeric>
#include <string>
#include <vector>

#include "utils/exceptions.hpp"
13 
/// Default set of characters treated as whitespace: space and horizontal tab.
/// Used as the default `to_remove` argument of trim().
14 #define WHITESPACES " \t"
15 
/// Drop the leading part of @p s that precedes the first character contained
/// in @p skip_to. The matching character itself is kept.
///
/// @param s        string to modify in place
/// @param skip_to  set of characters at which skipping stops
///
/// If @p s contains none of the characters in @p skip_to, it ends up empty.
inline void skip_to(std::string& s, const std::string& skip_to)
{
  // find_first_of() yields npos when nothing matches, and erase(0, npos)
  // removes the whole string, so both cases are handled by a single call.
  s.erase(0, s.find_first_of(skip_to));
}
22 
/// Strip from the front of @p s every character that is contained in
/// @p to_skip, stopping at the first character that is not.
///
/// @param s        string to modify in place
/// @param to_skip  set of characters to remove from the beginning of @p s
///
/// If @p s consists only of characters in @p to_skip, it ends up empty.
inline void skip_all(std::string& s, const std::string& to_skip)
{
  // find_first_not_of() yields npos when every character is skippable, and
  // erase(0, npos) removes the whole string, so one call covers both cases.
  s.erase(0, s.find_first_not_of(to_skip));
}
29 
/// Consume a decimal integer from the beginning of @p s and return it.
///
/// Parsing is delegated to std::stol, so optional leading whitespace and an
/// optional sign are accepted. Exactly the characters that std::stol consumed
/// are removed from the front of @p s; everything after the number is kept.
///
/// @param s  in/out: text starting with a number; the number is removed
/// @return   the parsed value
/// @throws std::invalid_argument  if @p s does not start with a number
/// @throws std::out_of_range      if the number does not fit into a long
///
/// If an exception is thrown, @p s is left unmodified.
inline long read_single_number(std::string& s)
{
  std::string::size_type consumed = 0;
  const long result = std::stol(s, &consumed);
  // remove the characters that made up the number from s
  s.erase(0, consumed);
  return result;
}
39 
41 std::string trim(const std::string& str, const std::string& to_remove = WHITESPACES)
42 {
43  const size_t first = str.find_first_not_of(to_remove);
44  if(first == std::string::npos) return "";
45  const size_t last = str.find_last_not_of(to_remove);
46  return str.substr(first, last - first + 1);
47 }
48 
50 
/// Return the Levenshtein (edit) distance between @p s1 and @p s2, i.e. the
/// minimum number of single-character insertions, deletions and
/// substitutions needed to turn one string into the other.
///
/// Uses a single rolling column of the dynamic-programming table, so memory
/// use is proportional to the length of @p s1.
///
/// @param s1  first string
/// @param s2  second string
/// @return the edit distance
inline unsigned levenshtein_distance(const std::string &s1, const std::string &s2)
{
  // To change the type this function manipulates and returns, change
  // the return type and the element type of the column below.
  const unsigned s1len = static_cast<unsigned>(s1.size());
  const unsigned s2len = static_cast<unsigned>(s2.size());

  // column[y] is the distance between the first y characters of s1 and the
  // prefix of s2 processed so far. For the empty prefix of s2 this is y
  // itself; column[0] is included so the result is defined for empty input.
  std::vector<unsigned> column(s1.size() + 1);
  std::iota(column.begin(), column.end(), 0u);

  for(unsigned x = 1; x <= s2len; ++x) {
    // value diagonally up-left of column[1]: the previous column[0]
    unsigned last_diagonal = column[0];
    column[0] = x;
    for(unsigned y = 1; y <= s1len; ++y) {
      const unsigned old_diagonal = column[y];
      const unsigned substitution =
        last_diagonal + (s1[y - 1] == s2[x - 1] ? 0u : 1u);
      column[y] = std::min({column[y] + 1,       // extend s2 prefix by one char
                            column[y - 1] + 1,   // extend s1 prefix by one char
                            substitution});      // match or substitute
      last_diagonal = old_diagonal;
    }
  }
  return column[s1len];
}
81 
/// Return the Hamming distance between the maximal prefixes of equal length
/// of @p s1 and @p s2, i.e. the number of positions below
/// min(s1.length(), s2.length()) at which the two strings differ.
///
/// Characters beyond the length of the shorter string are ignored; a
/// difference in length alone does not contribute to the result.
///
/// @param s1  first string
/// @param s2  second string
/// @return number of mismatching positions in the common-length prefixes
inline unsigned hamming_distance(const std::string& s1, const std::string& s2)
{
  const std::string::size_type common = std::min(s1.length(), s2.length());
  unsigned result = 0;
  for(std::string::size_type i = 0; i < common; ++i)
    result += (s1[i] != s2[i]);
  return result;
}
91 
/// Return the number of positions in [0, length) at which the character
/// arrays @p s1 and @p s2 differ.
///
/// @param s1      first character array; positions 0..length-1 are read
/// @param s2      second character array; positions 0..length-1 are read
/// @param length  number of characters to compare
/// @return number of mismatching positions
///
/// No bounds or null checks are performed; the caller must make sure that
/// both arrays hold at least @p length characters.
///
/// Declared inline because it is defined in a header; without it, including
/// the header from several translation units yields multiple definitions.
inline unsigned hamming_distance(const char* s1, const char* s2, unsigned length)
{
  unsigned result = 0;
  for(unsigned i = 0; i < length; ++i)
    result += (s1[i] != s2[i]);
  return result;
}
99 
101 
/// Merge @p seq1 and @p seq2, where the last @p overlap characters of
/// @p seq1 are replaced by the beginning of @p seq2: the result is the first
/// seq1.length() - overlap characters of @p seq1 followed by all of @p seq2.
///
/// This is the verbose variant that assembles the result in a manually
/// allocated buffer and traces every step to std::cout.
///
/// @param seq1     first (left) sequence
/// @param seq2     second (right) sequence
/// @param overlap  number of trailing characters of @p seq1 to drop;
///                 must not exceed seq1.length() (checked by assert)
/// @return the merged sequence, up to the first NUL character if any
/// @throws std::bad_alloc if the temporary buffer cannot be allocated
inline std::string merge_strings_segfault(const std::string& seq1,
                                          const std::string& seq2,
                                          const unsigned overlap)
{
  assert(overlap <= seq1.length());
  const unsigned seq1_len = seq1.length() - overlap;
  const unsigned seq2_len = seq2.length();
  const unsigned new_len = seq1_len + seq2_len;

  // The buffer is owned by a smart pointer so that it is released even if
  // constructing the result string throws.
  std::unique_ptr<char, void(*)(void*)> buffer(
    static_cast<char*>(std::malloc(new_len + 1)), std::free);
  if(!buffer) throw std::bad_alloc();

  char* const new_seq = buffer.get();
  char* const seq1_start = new_seq;
  char* const seq2_start = new_seq + seq1_len;
  std::cout << "got "<<new_len+1<<" bytes @"<<reinterpret_cast<std::uintptr_t>(new_seq)<<std::endl;

  std::cout << "writing "<<seq1_len<<" chars of "<<seq1<<" to "<<reinterpret_cast<std::uintptr_t>(seq1_start)<<std::endl;
  std::memcpy(seq1_start, seq1.c_str(), seq1_len);

  std::cout << "writing "<<seq2_len<<" chars of "<<seq2<<" to "<<reinterpret_cast<std::uintptr_t>(seq2_start)<<std::endl;
  std::memcpy(seq2_start, seq2.c_str(), seq2_len);
  // The terminator belongs at index new_len, the last byte of the
  // (new_len + 1)-byte buffer; index new_len + 1 would be out of bounds.
  new_seq[new_len] = 0;

  std::string result(new_seq);
  std::cout << "result string is "<< result <<" with c_str @"<<reinterpret_cast<std::uintptr_t>(result.c_str())<<", freeing pointer @"<<reinterpret_cast<std::uintptr_t>(new_seq)<<std::endl;
  buffer.reset();
  std::cout << "done!"<<std::endl;
  return result;
}
130 
132 
/// Merge @p seq1 and @p seq2, where the last @p overlap characters of
/// @p seq1 are replaced by the beginning of @p seq2: the result is the first
/// seq1.length() - overlap characters of @p seq1 followed by all of @p seq2.
///
/// The function does not check that the overlapping regions actually match.
///
/// @param seq1     first (left) sequence
/// @param seq2     second (right) sequence
/// @param overlap  number of trailing characters of @p seq1 to drop
/// @return the merged sequence
///
/// @note If @p overlap exceeds seq1.length(), the unsigned subtraction wraps
///       around and the prefix length is clipped to seq1.length(), so the
///       result is simply seq1 followed by seq2.
///
/// Declared inline because it is defined in a header; without it, including
/// the header from several translation units yields multiple definitions.
inline std::string merge_strings(const std::string& seq1,
                                 const std::string& seq2,
                                 const unsigned overlap)
{
  // prefix of seq1 that is not covered by the overlap
  std::string result(seq1, 0, seq1.length() - overlap);
  result += seq2;
  return result;
}
142 
143 
144 
145 #endif
std::string merge_strings_segfault(const std::string &seq1, const std::string &seq2, const unsigned overlap)
merge two sequences seq1 and seq2 with overlap "overlap" according to their order ...
Definition: string_utils.hpp:104
std::string merge_strings(const std::string &seq1, const std::string &seq2, const unsigned overlap)
merge two sequences seq1 and seq2 with overlap "overlap" according to their order ...
Definition: string_utils.hpp:134
unsigned levenshtein_distance(const std::string &s1, const std::string &s2)
return the Levenshtein distance between two strings
Definition: string_utils.hpp:53
void skip_all(std::string &s, const std::string &to_skip)
remove all characters in 'to_skip' from the beginning of s
Definition: string_utils.hpp:24
long read_single_number(std::string &s)
consume an integer from the beginning of s and return it
Definition: string_utils.hpp:31
void skip_to(std::string &s, const std::string &skip_to)
remove all characters not in 'skip_to' from the beginning of s
Definition: string_utils.hpp:17
std::string trim(const std::string &str, const std::string &to_remove=WHITESPACES)
remove leading & trailing chars (whitespaces by default) from str
Definition: string_utils.hpp:41
unsigned hamming_distance(const std::string &s1, const std::string &s2)
returns the Hamming distance between the maximal prefixes of equal length
Definition: string_utils.hpp:83