Skip to content

Commit 9624612

Browse files
committed
Change kmer_t into template
1 parent d3ea2c4 commit 9624612

27 files changed

Lines changed: 258 additions & 200 deletions

CMakeLists.txt

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -46,18 +46,10 @@ set(Z_LIB_SOURCES
4646
include/gz/zip_stream.cpp
4747
)
4848

49-
set(SSHASH_SOURCES
50-
include/dictionary.cpp
51-
include/info.cpp
52-
include/dump.cpp
53-
include/statistics.cpp
54-
include/builder/build.cpp
55-
)
5649

5750
# Create a static lib
5851
add_library(sshash_static STATIC
5952
${Z_LIB_SOURCES}
60-
${SSHASH_SOURCES}
6153
)
6254

6355
add_executable(sshash src/sshash.cpp)

include/bit_vector_iterator.hpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55

66
namespace sshash {
77

8+
template<class kmer_t>
89
struct bit_vector_iterator {
910
bit_vector_iterator() : m_bv(nullptr) {}
1011

include/buckets.hpp

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66

77
namespace sshash {
88

9+
template<class kmer_t>
910
struct buckets {
1011
std::pair<lookup_result, uint64_t> offset_to_id(uint64_t offset, uint64_t k) const {
1112
auto [pos, contig_begin, contig_end] = pieces.locate(offset);
@@ -52,13 +53,13 @@ struct buckets {
5253

5354
kmer_t contig_prefix(uint64_t contig_id, uint64_t k) const {
5455
uint64_t contig_begin = pieces.access(contig_id);
55-
bit_vector_iterator bv_it(strings, 2 * contig_begin);
56+
bit_vector_iterator<kmer_t> bv_it(strings, 2 * contig_begin);
5657
return bv_it.read(2 * (k - 1));
5758
}
5859

5960
kmer_t contig_suffix(uint64_t contig_id, uint64_t k) const {
6061
uint64_t contig_end = pieces.access(contig_id + 1);
61-
bit_vector_iterator bv_it(strings, 2 * (contig_end - k + 1));
62+
bit_vector_iterator<kmer_t> bv_it(strings, 2 * (contig_end - k + 1));
6263
return bv_it.read(2 * (k - 1));
6364
}
6465

@@ -90,7 +91,7 @@ struct buckets {
9091
uint64_t m) const {
9192
uint64_t offset = offsets.access(super_kmer_id);
9293
auto [res, contig_end] = offset_to_id(offset, k);
93-
bit_vector_iterator bv_it(strings, 2 * offset);
94+
bit_vector_iterator<kmer_t> bv_it(strings, 2 * offset);
9495
uint64_t window_size = std::min<uint64_t>(k - m + 1, contig_end - offset - k + 1);
9596
for (uint64_t w = 0; w != window_size; ++w) {
9697
kmer_t read_kmer = bv_it.read_and_advance_by_two(2 * k);
@@ -115,7 +116,7 @@ struct buckets {
115116
for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) {
116117
uint64_t offset = offsets.access(super_kmer_id);
117118
auto [res, contig_end] = offset_to_id(offset, k);
118-
bit_vector_iterator bv_it(strings, 2 * offset);
119+
bit_vector_iterator<kmer_t> bv_it(strings, 2 * offset);
119120
uint64_t window_size = std::min<uint64_t>(k - m + 1, contig_end - offset - k + 1);
120121
for (uint64_t w = 0; w != window_size; ++w) {
121122
kmer_t read_kmer = bv_it.read_and_advance_by_two(2 * k);
@@ -166,7 +167,7 @@ struct buckets {
166167

167168
void access(uint64_t kmer_id, char* string_kmer, uint64_t k) const {
168169
uint64_t offset = id_to_offset(kmer_id, k);
169-
bit_vector_iterator bv_it(strings, 2 * offset);
170+
bit_vector_iterator<kmer_t> bv_it(strings, 2 * offset);
170171
kmer_t read_kmer = bv_it.read(2 * k);
171172
util::uint_kmer_to_string(read_kmer, string_kmer, k);
172173
}
@@ -176,7 +177,7 @@ struct buckets {
176177

177178
iterator(buckets const* ptr, uint64_t kmer_id, uint64_t k, uint64_t num_kmers)
178179
: m_buckets(ptr), m_kmer_id(kmer_id), m_k(k), m_num_kmers(num_kmers) {
179-
bv_it = bit_vector_iterator(m_buckets->strings, -1);
180+
bv_it = bit_vector_iterator<kmer_t>(m_buckets->strings, -1);
180181
offset = m_buckets->id_to_offset(m_kmer_id, k);
181182
auto [pos, piece_end] = m_buckets->pieces.next_geq(offset);
182183
if (piece_end == offset) pos += 1;
@@ -219,7 +220,7 @@ struct buckets {
219220
uint64_t m_kmer_id, m_k, m_num_kmers;
220221
uint64_t offset;
221222
uint64_t next_offset;
222-
bit_vector_iterator bv_it;
223+
bit_vector_iterator<kmer_t> bv_it;
223224
ef_sequence<true>::iterator pieces_it;
224225

225226
kmer_t read_kmer;
Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
#include "../dictionary.hpp"
21
#include "../../external/pthash/external/essentials/include/essentials.hpp"
32
#include "util.hpp"
43

@@ -12,7 +11,8 @@
1211

1312
namespace sshash {
1413

15-
void dictionary::build(std::string const& filename, build_configuration const& build_config) {
14+
template<class kmer_t>
15+
void dictionary<kmer_t>::build(std::string const& filename, build_configuration const& build_config) {
1616
/* Validate the build configuration. */
1717
if (build_config.k == 0) throw std::runtime_error("k must be > 0");
1818
if (build_config.k > constants::max_k) {
@@ -41,7 +41,7 @@ void dictionary::build(std::string const& filename, build_configuration const& b
4141

4242
/* step 1: parse the input file and build compact string pool ***/
4343
timer.start();
44-
parse_data data = parse_file(filename, build_config);
44+
parse_data<kmer_t> data = parse_file<kmer_t>(filename, build_config);
4545
m_size = data.num_kmers;
4646
timer.stop();
4747
timings.push_back(timer.elapsed());

include/builder/build_index.hpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -155,7 +155,8 @@ struct bucket_pairs {
155155
}
156156
};
157157

158-
buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, buckets& m_buckets,
158+
template<class kmer_t>
159+
buckets_statistics build_index(parse_data<kmer_t>& data, minimizers const& m_minimizers, buckets<kmer_t>& m_buckets,
159160
build_configuration const& build_config) {
160161
uint64_t num_buckets = m_minimizers.size();
161162
uint64_t num_kmers = data.num_kmers;

include/builder/build_skew_index.hpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,8 @@
44

55
namespace sshash {
66

7-
void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& m_buckets,
7+
template<class kmer_t>
8+
void build_skew_index(skew_index<kmer_t>& m_skew_index, parse_data<kmer_t>& data, buckets<kmer_t> const& m_buckets,
89
build_configuration const& build_config,
910
buckets_statistics const& buckets_stats) {
1011
const uint64_t min_log2_size = m_skew_index.min_log2;
@@ -202,7 +203,7 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const&
202203
assert(lists[i].size() > lower and lists[i].size() <= upper);
203204
uint64_t super_kmer_id = 0;
204205
for (auto [offset, num_kmers_in_super_kmer] : lists[i]) {
205-
bit_vector_iterator bv_it(m_buckets.strings, 2 * offset);
206+
bit_vector_iterator<kmer_t> bv_it(m_buckets.strings, 2 * offset);
206207
for (uint64_t i = 0; i != num_kmers_in_super_kmer; ++i) {
207208
kmer_t kmer = bv_it.read(2 * build_config.k);
208209
keys_in_partition.push_back(kmer);

include/builder/parse_file.hpp

Lines changed: 14 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -4,15 +4,17 @@
44

55
namespace sshash {
66

7+
template<class kmer_t>
78
struct parse_data {
89
parse_data(std::string const& tmp_dirname) : num_kmers(0), minimizers(tmp_dirname) {}
910
uint64_t num_kmers;
1011
minimizers_tuples minimizers;
11-
compact_string_pool strings;
12+
compact_string_pool<kmer_t> strings;
1213
weights::builder weights_builder;
1314
};
1415

15-
void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) {
16+
template<class kmer_t>
17+
void parse_file(std::istream& is, parse_data<kmer_t>& data, build_configuration const& build_config) {
1618
uint64_t k = build_config.k;
1719
uint64_t m = build_config.m;
1820
uint64_t seed = build_config.seed;
@@ -29,7 +31,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
2931
/* fit into the wanted number of bits */
3032
assert(max_num_kmers_in_super_kmer < (1ULL << (sizeof(num_kmers_in_super_kmer_uint_type) * 8)));
3133

32-
compact_string_pool::builder builder(k);
34+
typename compact_string_pool<kmer_t>::builder builder(k);
3335

3436
std::string sequence;
3537
uint64_t prev_minimizer = constants::invalid_uint64;
@@ -161,12 +163,12 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
161163
while (end != sequence.size() - k + 1) {
162164
char const* kmer = sequence.data() + end;
163165
assert(util::is_valid(kmer, k));
164-
kmer_t uint_kmer = util::string_to_uint_kmer(kmer, k);
165-
uint64_t minimizer = util::compute_minimizer(uint_kmer, k, m, seed);
166+
kmer_t uint_kmer = util::string_to_uint_kmer<kmer_t>(kmer, k);
167+
uint64_t minimizer = util::compute_minimizer<kmer_t>(uint_kmer, k, m, seed);
166168

167169
if (build_config.canonical_parsing) {
168-
kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, k);
169-
uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed);
170+
kmer_t uint_kmer_rc = util::compute_reverse_complement<kmer_t>(uint_kmer, k);
171+
uint64_t minimizer_rc = util::compute_minimizer<kmer_t>(uint_kmer_rc, k, m, seed);
170172
minimizer = std::min<uint64_t>(minimizer, minimizer_rc);
171173
}
172174

@@ -205,16 +207,17 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
205207
}
206208
}
207209

208-
parse_data parse_file(std::string const& filename, build_configuration const& build_config) {
210+
template<class kmer_t>
211+
parse_data<kmer_t> parse_file(std::string const& filename, build_configuration const& build_config) {
209212
std::ifstream is(filename.c_str());
210213
if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'");
211214
std::cout << "reading file '" << filename << "'..." << std::endl;
212-
parse_data data(build_config.tmp_dirname);
215+
parse_data<kmer_t> data(build_config.tmp_dirname);
213216
if (util::ends_with(filename, ".gz")) {
214217
zip_istream zis(is);
215-
parse_file(zis, data, build_config);
218+
parse_file<kmer_t>(zis, data, build_config);
216219
} else {
217-
parse_file(is, data, build_config);
220+
parse_file<kmer_t>(is, data, build_config);
218221
}
219222
is.close();
220223
return data;

include/builder/util.hpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ struct parse_runtime_error : public std::runtime_error {
2626
}
2727
}
2828

29+
template<class kmer_t>
2930
struct compact_string_pool {
3031
compact_string_pool() {}
3132

@@ -49,7 +50,7 @@ struct compact_string_pool {
4950
pieces.push_back(bvb_strings.size() / 2);
5051
}
5152
for (uint64_t i = prefix; i != size; ++i) {
52-
bvb_strings.append_bits(util::char_to_uint(string[i]), 2);
53+
bvb_strings.append_bits(util::char_to_uint<kmer_t>(string[i]), 2);
5354
}
5455
num_super_kmers += 1;
5556
offset = bvb_strings.size() / 2;

include/constants.hpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,10 @@
44

55
namespace sshash::constants {
66

7-
constexpr uint64_t uint_kmer_bits = sizeof(kmer_t) * 8;
7+
constexpr uint64_t uint_kmer_bits = sizeof(default_kmer_t) * 8;
88

99
/* max *odd* size that can be packed into sizeof(kmer_t)*8 bits */
10-
constexpr uint64_t max_k = sizeof(kmer_t) * 4 - 1;
10+
constexpr uint64_t max_k = sizeof(default_kmer_t) * 4 - 1;
1111

1212
/* max *odd* size that can be packed into 64 bits */
1313
constexpr uint64_t max_m = 31;

include/dictionary.hpp

Lines changed: 19 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,14 @@
88

99
namespace sshash {
1010

11+
// Forward declarations of the friend template classes
12+
template<class kmer_t>
13+
struct streaming_query_canonical_parsing;
14+
15+
template<class kmer_t>
16+
struct streaming_query_regular_parsing;
17+
18+
template<class kmer_t = default_kmer_t>
1119
struct dictionary {
1220
dictionary() : m_size(0), m_seed(0), m_k(0), m_m(0), m_canonical_parsing(0) {}
1321

@@ -61,8 +69,8 @@ struct dictionary {
6169
bool is_member_uint(kmer_t uint_kmer, bool check_reverse_complement = true) const;
6270

6371
/* Streaming queries. */
64-
friend struct streaming_query_canonical_parsing;
65-
friend struct streaming_query_regular_parsing;
72+
friend struct streaming_query_canonical_parsing<kmer_t>;
73+
friend struct streaming_query_regular_parsing<kmer_t>;
6674
streaming_query_report streaming_query_from_file(std::string const& filename,
6775
bool multiline) const;
6876

@@ -75,7 +83,7 @@ struct dictionary {
7583
std::pair<uint64_t, std::string> next() { return it.next(); }
7684

7785
private:
78-
typename buckets::iterator it;
86+
typename buckets<kmer_t>::iterator it;
7987
};
8088

8189
iterator begin() const { return iterator(this); }
@@ -112,8 +120,8 @@ struct dictionary {
112120
uint16_t m_m;
113121
uint16_t m_canonical_parsing;
114122
minimizers m_minimizers;
115-
buckets m_buckets;
116-
skew_index m_skew_index;
123+
buckets<kmer_t> m_buckets;
124+
skew_index<kmer_t> m_skew_index;
117125
weights m_weights;
118126

119127
lookup_result lookup_uint_regular_parsing(kmer_t uint_kmer) const;
@@ -123,3 +131,9 @@ struct dictionary {
123131
};
124132

125133
} // namespace sshash
134+
135+
#include "builder/build.impl"
136+
#include "dictionary.impl"
137+
#include "dump.impl"
138+
#include "info.impl"
139+
#include "statistics.impl"

0 commit comments

Comments
 (0)