Raptor 3.0.0-rc.1
A fast and space-efficient pre-filter for querying very large collections of nucleotide sequences
 
search_singular_ibf.hpp
Go to the documentation of this file.
1// --------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md
6// --------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <seqan3/search/views/minimiser_hash.hpp>
16
23
24namespace raptor
25{
26
27template <typename index_t>
28void search_singular_ibf(search_arguments const & arguments, index_t && index)
29{
30 constexpr bool is_ibf = std::same_as<index_t, raptor_index<index_structure::ibf>>
31 || std::same_as<index_t, raptor_index<index_structure::ibf_compressed>>;
32
33 auto cereal_worker = [&]()
34 {
35 load_index(index, arguments);
36 };
37 auto cereal_handle = std::async(std::launch::async, cereal_worker);
38
39 seqan3::sequence_file_input<dna4_traits, seqan3::fields<seqan3::field::id, seqan3::field::seq>> fin{
40 arguments.query_file};
41 using record_type = typename decltype(fin)::record_type;
43
44 sync_out synced_out{arguments};
45
46 raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()};
47
48 auto worker = [&](size_t const start, size_t const end)
49 {
50 timer<concurrent::no> local_compute_minimiser_timer{};
51 timer<concurrent::no> local_query_ibf_timer{};
52 timer<concurrent::no> local_generate_results_timer{};
53
54 auto counter = [&index]()
55 {
56 if constexpr (is_ibf)
57 return index.ibf().template counting_agent<uint16_t>();
58 else
59 return index.ibf().membership_agent();
60 }();
61 std::string result_string{};
62 std::vector<uint64_t> minimiser;
63
64 auto hash_adaptor = seqan3::views::minimiser_hash(arguments.shape,
65 seqan3::window_size{arguments.window_size},
66 seqan3::seed{adjust_seed(arguments.shape_weight)});
67
68 for (auto && [id, seq] : records | seqan3::views::slice(start, end))
69 {
70 result_string.clear();
71 result_string += id;
72 result_string += '\t';
73
74 auto minimiser_view = seq | hash_adaptor | std::views::common;
75 local_compute_minimiser_timer.start();
76 minimiser.assign(minimiser_view.begin(), minimiser_view.end());
77 local_compute_minimiser_timer.stop();
78
79 size_t const minimiser_count{minimiser.size()};
80 size_t const threshold = thresholder.get(minimiser_count);
81
82 if constexpr (is_ibf)
83 {
84 local_query_ibf_timer.start();
85 auto & result = counter.bulk_count(minimiser);
86 local_query_ibf_timer.stop();
87 size_t current_bin{0};
88 local_generate_results_timer.start();
89 for (auto && count : result)
90 {
91 if (count >= threshold)
92 {
93 result_string += std::to_string(current_bin);
94 result_string += ',';
95 }
96 ++current_bin;
97 }
98 }
99 else
100 {
101 local_query_ibf_timer.start();
102 auto & result = counter.bulk_contains(minimiser, threshold); // Results contains user bin IDs
103 local_query_ibf_timer.stop();
104 local_generate_results_timer.start();
105 for (auto && count : result)
106 {
107 result_string += std::to_string(count);
108 result_string += ',';
109 }
110 }
111
112 if (auto & last_char = result_string.back(); last_char == ',')
113 last_char = '\n';
114 else
115 result_string += '\n';
116
117 synced_out.write(result_string);
118 local_generate_results_timer.stop();
119 }
120
121 arguments.compute_minimiser_timer += local_compute_minimiser_timer;
122 arguments.query_ibf_timer += local_query_ibf_timer;
123 arguments.generate_results_timer += local_generate_results_timer;
124 };
125
126 for (auto && chunked_records : fin | seqan3::views::chunk((1ULL << 20) * 10))
127 {
128 records.clear();
129 arguments.query_file_io_timer.start();
130 std::ranges::move(chunked_records, std::back_inserter(records));
131 arguments.query_file_io_timer.stop();
132
133 cereal_handle.wait();
134
135 do_parallel(worker, records.size(), arguments.threads);
136 }
137}
138
139} // namespace raptor
Provides raptor::adjust_seed.
T async(T... args)
T back_inserter(T... args)
Definition: threshold.hpp:22
Provides raptor::dna4_traits.
Provides raptor::do_parallel.
T end(T... args)
Provides raptor::load_index.
T move(T... args)
Provides raptor::sync_out.
Provides raptor::threshold::threshold.
T to_string(T... args)