Raptor 3.0.0-rc.1
A fast and space-efficient pre-filter for querying very large collections of nucleotide sequences
 
forward_strand_minimiser.hpp
Go to the documentation of this file.
1// --------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md
6// --------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <deque>
16
17#include <seqan3/alphabet/nucleotide/dna4.hpp>
18#include <seqan3/search/views/kmer_hash.hpp>
19
22
23namespace raptor::threshold
24{
25
26// Minimiser without looking at reverse complement
28{
29private:
31 uint64_t window_size{};
33 seqan3::shape shape{};
35 uint8_t shape_size{};
37 uint64_t seed{};
40
41public:
44
51
56 forward_strand_minimiser(window const window_size_, seqan3::shape const shape_) :
57 window_size{window_size_.v},
58 shape{shape_},
59 shape_size{shape.size()},
60 seed{adjust_seed(shape.count())}
61 {
62 assert(window_size >= shape_size);
63 }
64
69 void resize(window const window_size_, seqan3::shape const shape_)
70 {
71 window_size = window_size_.v;
72 shape = shape_;
73 shape_size = shape.size();
74 seed = adjust_seed(shape.count());
75 assert(window_size >= shape_size);
76 }
77
78 void compute(std::vector<seqan3::dna4> const & text)
79 {
80 assert(window_size && shape_size && seed); // Forgot to initialise/resize?
81
82 size_t const text_length = text.size();
83 assert(shape_size <= text_length);
84 assert(window_size <= text_length);
85
86 uint64_t const max_number_of_minimiser = text_length - window_size + 1u;
87 uint64_t const kmers_per_window = window_size - shape_size + 1u;
88
90 minimiser_begin.reserve(max_number_of_minimiser);
91
92 // Compute all k-mer hashes.
93 auto apply_xor = [this](uint64_t const value)
94 {
95 return value ^ seed;
96 };
97 auto kmer_view = text | seqan3::views::kmer_hash(shape) | std::views::transform(apply_xor);
98 forward_hashes.assign(kmer_view.begin(), kmer_view.end());
99
100 // Stores hash and begin for all k-mers in the window
102
103 // Initialisation. We need to compute all hashes for the first window.
104 for (uint64_t i = 0; i < kmers_per_window; ++i)
105 window_hashes.emplace_back(forward_hashes[i], i);
106
107 // The minimum hash is the minimiser. Store the begin position.
108 auto min = std::min_element(std::begin(window_hashes), std::end(window_hashes));
109 minimiser_begin.push_back(min->second);
110
111 // For the following windows, we remove the first window k-mer (is now not in window) and add the new k-mer
112 // that results from the window shifting.
113 for (uint64_t i = kmers_per_window; i < max_number_of_minimiser; ++i)
114 {
115 // Already store the new hash without removing the first one.
116 uint64_t const new_hash{forward_hashes[i + kmers_per_window - 1]}; // Already did kmers_per_window - 1 many
117 window_hashes.emplace_back(new_hash, i);
118
119 if (new_hash < min->second) // New hash is the minimum.
120 {
121 min = std::prev(std::end(window_hashes));
122 minimiser_begin.push_back(min->second);
123 }
124 else if (min == std::begin(window_hashes)) // Minimum is the yet-to-be-removed begin of the window.
125 {
126 // The first hash will be removed, the last one is caught by the previous if.
127 min = std::min_element(++std::begin(window_hashes), std::prev(std::end(window_hashes)));
129 }
130
131 window_hashes.pop_front(); // Remove the first k-mer.
132 }
133 return;
134 }
135};
136
137} // namespace raptor::threshold
Provides raptor::adjust_seed.
T assign(T... args)
T begin(T... args)
T clear(T... args)
T emplace_back(T... args)
T end(T... args)
T min_element(T... args)
T min(T... args)
T pop_front(T... args)
T prev(T... args)
T push_back(T... args)
T reserve(T... args)
T size(T... args)
Provides raptor::window.
Definition: forward_strand_minimiser.hpp:28
uint64_t window_size
The window size of the minimiser.
Definition: forward_strand_minimiser.hpp:31
forward_strand_minimiser(window const window_size_, seqan3::shape const shape_)
Constructs a minimiser from given k-mer, window size and a seed.
Definition: forward_strand_minimiser.hpp:56
std::vector< uint64_t > forward_hashes
Stores the k-mer hashes of the forward strand.
Definition: forward_strand_minimiser.hpp:39
forward_strand_minimiser(forward_strand_minimiser &&)=default
Defaulted.
forward_strand_minimiser & operator=(forward_strand_minimiser const &)=default
Defaulted.
uint64_t seed
Random but fixed value to xor k-mers with. Counteracts consecutive minimisers.
Definition: forward_strand_minimiser.hpp:37
forward_strand_minimiser & operator=(forward_strand_minimiser &&)=default
Defaulted.
void resize(window const window_size_, seqan3::shape const shape_)
Resize the minimiser.
Definition: forward_strand_minimiser.hpp:69
uint8_t shape_size
The size of the shape.
Definition: forward_strand_minimiser.hpp:35
std::vector< uint64_t > minimiser_begin
Stores the begin positions of the minimisers.
Definition: forward_strand_minimiser.hpp:43
seqan3::shape shape
The shape to use.
Definition: forward_strand_minimiser.hpp:33
forward_strand_minimiser(forward_strand_minimiser const &)=default
Defaulted.
Strong type for passing the window size.
Definition: strong_types.hpp:22