Raptor 3.0.0-rc.1
A fast and space-efficient pre-filter for querying very large collections of nucleotide sequences
 
hierarchical_interleaved_bloom_filter.hpp
Go to the documentation of this file.
1// --------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md
6// --------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <ranges>
16
17#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>
18
19#ifndef RAPTOR_HIBF_HAS_COUNT
20# define RAPTOR_HIBF_HAS_COUNT 0
21#endif
22
23namespace raptor
24{
25
86template <seqan3::data_layout data_layout_mode_ = seqan3::data_layout::uncompressed>
88{
89public:
90 // Forward declaration
91 class user_bins;
92
93 // Forward declaration
94 class membership_agent;
95
96#if RAPTOR_HIBF_HAS_COUNT
97 // Forward declaration
98 template <std::integral value_t>
100#endif
101
103 static constexpr seqan3::data_layout data_layout_mode = data_layout_mode_;
104
106 using ibf_t = seqan3::interleaved_bloom_filter<data_layout_mode_>;
107
119
121
124
133
136
139 {
141 }
142
143#if RAPTOR_HIBF_HAS_COUNT
147 template <std::integral value_t = uint16_t>
149 {
150 return counting_agent_type<value_t>{*this};
151 }
152#endif
153
162 template <seqan3::cereal_archive archive_t>
163 void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
164 {
165 archive(ibf_vector);
166 archive(next_ibf_id);
167 archive(user_bins);
168 }
170};
171
174template <seqan3::data_layout data_layout_mode>
176{
177private:
180
187 std::vector<std::vector<int64_t>> ibf_bin_to_filename_position{};
188
189public:
191 size_t num_user_bins() const noexcept
192 {
193 return user_bin_filenames.size();
194 }
195
197 void set_ibf_count(size_t const size)
198 {
199 ibf_bin_to_filename_position.resize(size);
200 }
201
203 void set_user_bin_count(size_t const size)
204 {
205 user_bin_filenames.resize(size);
206 }
207
218 {
219 return ibf_bin_to_filename_position[idx];
220 }
221
232 {
233 return user_bin_filenames[idx];
234 }
235
237 std::string const & operator[](std::pair<size_t, size_t> const & index_pair) const
238 {
239 return user_bin_filenames[ibf_bin_to_filename_position[index_pair.first][index_pair.second]];
240 }
241
245 auto operator[](size_t const ibf_idx) const
246 {
247 return ibf_bin_to_filename_position[ibf_idx]
248 | std::views::transform(
249 [this](int64_t i)
250 {
251 if (i == -1)
252 return std::string{};
253 else
254 return user_bin_filenames[i];
255 });
256 }
257
259 int64_t filename_index(size_t const ibf_idx, size_t const bin_idx) const
260 {
261 return ibf_bin_to_filename_position[ibf_idx][bin_idx];
262 }
263
269 template <typename stream_t>
270 void write_filenames(stream_t & out_stream) const
271 {
272 size_t position{};
273 std::string line{};
274 for (auto const & filename : user_bin_filenames)
275 {
276 line.clear();
277 line = '#';
278 line += std::to_string(position);
279 line += '\t';
280 line += filename;
281 line += '\n';
282 out_stream << line;
283 ++position;
284 }
285 }
286
295 template <typename archive_t>
296 void serialize(archive_t & archive)
297 {
298 archive(user_bin_filenames);
299 archive(ibf_bin_to_filename_position);
300 }
302};
303
309template <seqan3::data_layout data_layout_mode> // TODO: value_t as template?
311{
312private:
315
317 hibf_t const * const hibf_ptr{nullptr};
318
320 template <std::ranges::forward_range value_range_t>
321 void bulk_contains_impl(value_range_t && values, int64_t const ibf_idx, size_t const threshold)
322 {
323 auto agent = hibf_ptr->ibf_vector[ibf_idx].template counting_agent<uint16_t>();
324 auto & result = agent.bulk_count(values);
325
326 uint16_t sum{};
327
328 for (size_t bin{}; bin < result.size(); ++bin)
329 {
330 sum += result[bin];
331
332 auto const current_filename_index = hibf_ptr->user_bins.filename_index(ibf_idx, bin);
333
334 if (current_filename_index < 0) // merged bin
335 {
336 // if (sum >= threshold)
337 bulk_contains_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold);
338 sum = 0u;
339 }
340 else if (bin + 1u == result.size() || // last bin
341 current_filename_index != hibf_ptr->user_bins.filename_index(ibf_idx, bin + 1)) // end of split bin
342 {
343 if (sum >= threshold)
344 result_buffer.emplace_back(current_filename_index);
345 sum = 0u;
346 }
347 }
348 }
349
350public:
354 membership_agent() = default;
359 ~membership_agent() = default;
360
365 explicit membership_agent(hibf_t const & hibf) : hibf_ptr(std::addressof(hibf))
366 {}
368
371
389 template <std::ranges::forward_range value_range_t>
390 [[nodiscard]] std::vector<int64_t> const & bulk_contains(value_range_t && values, size_t const threshold) & noexcept
391 {
392 assert(hibf_ptr != nullptr);
393
394 static_assert(std::ranges::forward_range<value_range_t>, "The values must model forward_range.");
395 static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
396 "An individual value must be an unsigned integral.");
397
398 result_buffer.clear();
399
400 bulk_contains_impl(values, 0, threshold);
401
402 std::ranges::sort(result_buffer); // TODO: necessary?
403
404 return result_buffer;
405 }
406
407 // `bulk_contains` cannot be called on a temporary, since the object the returned reference points to
408 // is immediately destroyed.
409 template <std::ranges::range value_range_t>
410 [[nodiscard]] std::vector<int64_t> const & bulk_contains(value_range_t && values,
411 size_t const threshold) && noexcept = delete;
413};
414
415#if RAPTOR_HIBF_HAS_COUNT
418template <seqan3::data_layout data_layout_mode>
419template <std::integral value_t>
421{
422private:
425
427 hibf_t const * const hibf_ptr{nullptr};
428
430 template <std::ranges::forward_range value_range_t>
431 void bulk_count_impl(value_range_t && values, int64_t const ibf_idx, size_t const threshold)
432 {
433 auto agent = hibf_ptr->ibf_vector[ibf_idx].template counting_agent<value_t>();
434 auto & result = agent.bulk_count(values);
435
436 value_t sum{};
437
438 for (size_t bin{}; bin < result.size(); ++bin)
439 {
440 sum += result[bin];
441 auto const current_filename_index = hibf_ptr->user_bins.filename_index(ibf_idx, bin);
442
443 if (current_filename_index < 0) // merged bin
444 {
445 if (sum >= threshold)
446 bulk_count_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold);
447 sum = 0u;
448 }
449 else if (bin + 1u == result.size() || // last bin
450 current_filename_index != hibf_ptr->user_bins.filename_index(ibf_idx, bin + 1)) // end of split bin
451 {
452 if (sum >= threshold)
453 result_buffer[current_filename_index] = sum;
454 sum = 0u;
455 }
456 }
457 }
458
459public:
469
474 explicit counting_agent_type(hibf_t const & hibf) :
475 hibf_ptr(std::addressof(hibf)),
476 result_buffer(hibf_ptr->user_bins.num_user_bins())
477 {}
479
481 seqan3::counting_vector<value_t> result_buffer;
482
502 template <std::ranges::forward_range value_range_t>
503 [[nodiscard]] seqan3::counting_vector<value_t> const & bulk_count(value_range_t && values,
504 size_t const threshold = 1u) & noexcept
505 {
506 assert(hibf_ptr != nullptr);
507 assert(threshold > 0u);
508 assert(result_buffer.size() == hibf_ptr->user_bins.num_user_bins());
509
510 static_assert(std::ranges::forward_range<value_range_t>, "The values must model forward_range.");
511 static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
512 "An individual value must be an unsigned integral.");
513
514 std::ranges::fill(result_buffer, static_cast<value_t>(0u));
515
516 bulk_count_impl(values, 0, threshold);
517
518 return result_buffer;
519 }
520
521 // `bulk_count` cannot be called on a temporary, since the object the returned reference points to
522 // is immediately destroyed.
523 template <std::ranges::range value_range_t>
524 [[nodiscard]] seqan3::counting_vector<value_t> const & bulk_count(value_range_t && values,
525 size_t const threshold = 1u) && noexcept = delete;
527};
528#endif // RAPTOR_HIBF_HAS_COUNT
529
530} // namespace raptor
Manages counting ranges of values for the raptor::hierarchical_interleaved_bloom_filter.
Definition: hierarchical_interleaved_bloom_filter.hpp:421
counting_agent_type(hibf_t const &hibf)
Construct a counting_agent_type for an existing hierarchical_interleaved_bloom_filter.
Definition: hierarchical_interleaved_bloom_filter.hpp:474
counting_agent_type & operator=(counting_agent_type const &)=default
Defaulted.
counting_agent_type & operator=(counting_agent_type &&)=default
Defaulted.
void bulk_count_impl(value_range_t &&values, int64_t const ibf_idx, size_t const threshold)
Helper for recursive bulk counting.
Definition: hierarchical_interleaved_bloom_filter.hpp:431
counting_agent_type(counting_agent_type &&)=default
Defaulted.
seqan3::counting_vector< value_t > const & bulk_count(value_range_t &&values, size_t const threshold=1u) &noexcept
Counts the occurrences in each bin for all values in a range.
Definition: hierarchical_interleaved_bloom_filter.hpp:503
counting_agent_type(counting_agent_type const &)=default
Defaulted.
seqan3::counting_vector< value_t > result_buffer
Stores the result of bulk_count().
Definition: hierarchical_interleaved_bloom_filter.hpp:481
Manages membership queries for the raptor::hierarchical_interleaved_bloom_filter.
Definition: hierarchical_interleaved_bloom_filter.hpp:311
membership_agent(hibf_t const &hibf)
Construct a membership_agent for an existing hierarchical_interleaved_bloom_filter.
Definition: hierarchical_interleaved_bloom_filter.hpp:365
void bulk_contains_impl(value_range_t &&values, int64_t const ibf_idx, size_t const threshold)
Helper for recursive membership querying.
Definition: hierarchical_interleaved_bloom_filter.hpp:321
membership_agent & operator=(membership_agent const &)=default
Defaulted.
membership_agent(membership_agent const &)=default
Defaulted.
membership_agent & operator=(membership_agent &&)=default
Defaulted.
std::vector< int64_t > const & bulk_contains(value_range_t &&values, size_t const threshold) &noexcept
Determines set membership of given values, and returns the user bin indices of occurrences.
Definition: hierarchical_interleaved_bloom_filter.hpp:390
std::vector< int64_t > result_buffer
Stores the result of bulk_contains().
Definition: hierarchical_interleaved_bloom_filter.hpp:370
membership_agent(membership_agent &&)=default
Defaulted.
Bookkeeping for user and technical bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:176
int64_t filename_index(size_t const ibf_idx, size_t const bin_idx) const
Returns the filename index of the ibf_idxth IBF for bin bin_idx.
Definition: hierarchical_interleaved_bloom_filter.hpp:259
void set_ibf_count(size_t const size)
Changes the number of managed IBFs.
Definition: hierarchical_interleaved_bloom_filter.hpp:197
void set_user_bin_count(size_t const size)
Changes the number of managed user bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:203
auto operator[](size_t const ibf_idx) const
Returns a view over the user bin filenames for the ibf_idxth IBF. An empty string is returned for mer...
Definition: hierarchical_interleaved_bloom_filter.hpp:245
void write_filenames(stream_t &out_stream) const
Writes all filenames to a stream. Index and filename are tab-separated.
Definition: hierarchical_interleaved_bloom_filter.hpp:270
std::string const & operator[](std::pair< size_t, size_t > const &index_pair) const
For a pair (a,b), returns a const reference to the filename of the user bin at IBF a,...
Definition: hierarchical_interleaved_bloom_filter.hpp:237
std::string & filename_of_user_bin(size_t const idx)
Returns the filename of the idxth user bin.
Definition: hierarchical_interleaved_bloom_filter.hpp:231
void serialize(archive_t &archive)
Serialisation support function.
Definition: hierarchical_interleaved_bloom_filter.hpp:296
size_t num_user_bins() const noexcept
Returns the number of managed user bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:191
std::vector< int64_t > & bin_indices_of_ibf(size_t const idx)
Returns a vector containing user bin indices for each bin in the idxth IBF.
Definition: hierarchical_interleaved_bloom_filter.hpp:217
std::vector< std::string > user_bin_filenames
Contains filenames of all user bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:179
The HIBF binning directory. A data structure that efficiently answers set-membership queries for mult...
Definition: hierarchical_interleaved_bloom_filter.hpp:88
counting_agent_type< value_t > counting_agent() const
Returns a counting_agent_type to be used for counting.
Definition: hierarchical_interleaved_bloom_filter.hpp:148
hierarchical_interleaved_bloom_filter(hierarchical_interleaved_bloom_filter const &)=default
Defaulted.
~hierarchical_interleaved_bloom_filter()=default
Defaulted.
std::vector< ibf_t > ibf_vector
The individual interleaved Bloom filters.
Definition: hierarchical_interleaved_bloom_filter.hpp:123
hierarchical_interleaved_bloom_filter & operator=(hierarchical_interleaved_bloom_filter const &)=default
Defaulted.
membership_agent membership_agent() const
Returns a membership_agent to be used for counting.
Definition: hierarchical_interleaved_bloom_filter.hpp:138
static constexpr seqan3::data_layout data_layout_mode
Indicates whether the Interleaved Bloom Filter is compressed.
Definition: hierarchical_interleaved_bloom_filter.hpp:103
hierarchical_interleaved_bloom_filter & operator=(hierarchical_interleaved_bloom_filter &&)=default
Defaulted.
std::vector< std::vector< int64_t > > next_ibf_id
Stores for each bin in each IBF of the HIBF the ID of the next IBF.
Definition: hierarchical_interleaved_bloom_filter.hpp:132
hierarchical_interleaved_bloom_filter(hierarchical_interleaved_bloom_filter &&)=default
Defaulted.
seqan3::interleaved_bloom_filter< data_layout_mode_ > ibf_t
The type of an individual Bloom filter.
Definition: hierarchical_interleaved_bloom_filter.hpp:106
hierarchical_interleaved_bloom_filter()=default
Defaulted.
user_bins user_bins
The underlying user bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:135
T clear(T... args)
T fill(T... args)
T resize(T... args)
T size(T... args)
T sort(T... args)
T to_string(T... args)