SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
 
Loading...
Searching...
No Matches
cigar_from_alignment.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <vector>
16
20
21namespace seqan3
22{
23
30{
31 uint32_t hard_front{};
32 uint32_t hard_back{};
33 uint32_t soft_front{};
34 uint32_t soft_back{};
35};
36
113template <typename alignment_type>
114inline auto cigar_from_alignment(alignment_type const & alignment,
115 cigar_clipped_bases const & clipped_bases = {},
116 bool const extended_cigar = false)
117{
119 && std::tuple_size_v<std::remove_cvref_t<alignment_type>> == 2),
120 "The alignment must be a std::pair or std::tuple of size 2.");
121
122 static_assert(
123 (std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(alignment))>>
124 && std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(alignment))>>),
125 "The alignment must contain two ranges whose value_type is comparable to seqan3::gap.");
126
127 /*brief: Compares two seqan3::aligned_sequence values and returns their cigar operation.
128 * param reference_char The aligned character of the reference to compare.
129 * param query_char The aligned character of the query to compare.
130 * param extended_cigar Whether to print the extended cigar alphabet or not. See cigar operation.
131 * returns A seqan3::cigar::operation representing the alignment operation between the two values.
132 *
133 * The resulting CIGAR operation is based on the query character (`query_char`).
134 *
135 * ### Example:
136 *
137 * The following alignment column shows the reference char ('C') on top and a
138 * gap for the query char at the bottom.
139 * ```
140 * ... C ...
141 * |
142 * ... - ...
143 * ```
144 * In this case, `to_cigar_op` will return
145 * 'D' since the query char is "deleted".
146 *
147 * The next alignment column shows the reference char ('C') on top and a
148 * query char ('G') at the bottom.
149 * ```
150 * ... C ...
151 * |
152 * ... G ...
153 * ```
154 * In this case, `to_cigar_op` will return
155 * 'M', for the basic CIGAR the two bases are aligned, while
156 * in the extended CIGAR alphabet (`extended_cigar` = `true`) the function
157 * will return an 'X' since the bases are aligned but are not
158 * equal.
159 * \sa seqan3::aligned_sequence
160 */
161 auto to_cigar_op = [extended_cigar](auto const reference_char, auto const query_char)
162 {
163 // note that N is not considered because it is equivalent to D but has a special meaning:
164 // SAM spec: "For mRNA-to-genome alignment, an N operation represents an intron. For other types of alignments,
165 // the interpretation of N is not defined."
166 // as we cannot know the meaning, the user has to change D -> N themself
167 constexpr std::array<char, 6> operators{'M', 'D', 'I', 'P', 'X', '='}; // contains the possible cigar operators.
168
169 // no gaps -> 00 -> 0
170 // gap only in query -> 01 -> 1
171 // gap only in reference -> 10 -> 2
172 // both gaps -> 11 -> 3
173 uint8_t key = (static_cast<uint8_t>(reference_char == gap{}) << 1) | static_cast<uint8_t>(query_char == gap{});
174
175 // mismatch -> 100 -> 4
176 // match -> 101 -> 5
177 if (extended_cigar && (key == 0)) // in extended format, refine the substitution operator to match/mismatch.
178 key |= ((1 << 2) | static_cast<uint8_t>(query_char == reference_char)); // maps to [4, 5].
179
180 return assign_char_to(operators[key], cigar::operation{});
181 };
182
183 using std::get;
184
185 auto & ref_seq = get<0>(alignment);
186 auto & query_seq = get<1>(alignment);
187
188 if (ref_seq.size() != query_seq.size())
189 throw std::logic_error{"The aligned sequences (including gaps) must have the same length."};
190
191 if (std::ranges::empty(ref_seq)) // only check ref_seq because query_seq was checked to have to same size
192 throw std::logic_error{"The aligned sequences may not be empty."};
193
194 std::vector<cigar> result{};
195
196 // Add (H)ard-clipping at the start of the query
197 if (clipped_bases.hard_front)
198 result.emplace_back(clipped_bases.hard_front, 'H'_cigar_operation);
199
200 // Add (S)oft-clipping at the start of the query
201 if (clipped_bases.soft_front)
202 result.emplace_back(clipped_bases.soft_front, 'S'_cigar_operation);
203
204 // Create cigar string from alignment
205 // -------------------------------------------------------------------------
206 // initialize first operation and count value:
207 cigar::operation operation{to_cigar_op(ref_seq[0], query_seq[0])};
208 uint32_t count{0};
209
210 // go through alignment columns
211 for (auto && [reference_char, query_char] : views::zip(ref_seq, query_seq))
212 {
213 cigar::operation next_op = to_cigar_op(reference_char, query_char);
214
215 if (operation == next_op)
216 {
217 ++count;
218 }
219 else
220 {
221 result.emplace_back(count, operation);
222 operation = next_op;
223 count = 1;
224 }
225 }
226
227 // append last cigar element
228 result.emplace_back(count, operation);
229
230 // Add (S)oft-clipping at the end of the query
231 if (clipped_bases.soft_back)
232 result.emplace_back(clipped_bases.soft_back, 'S'_cigar_operation);
233
234 // Add (H)ard-clipping at the end of the query
235 if (clipped_bases.hard_back)
236 result.emplace_back(clipped_bases.hard_back, 'H'_cigar_operation);
237
238 return result;
239}
240
241} // namespace seqan3
Provides the seqan3::cigar alphabet.
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: alphabet/cigar/cigar.hpp:96
T emplace_back(T... args)
Provides seqan3::gap.
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition: alphabet/concept.hpp:524
auto cigar_from_alignment(alignment_type const &alignment, cigar_clipped_bases const &clipped_bases={}, bool const extended_cigar=false)
Creates a CIGAR string (SAM format) given a seqan3::detail::pairwise_alignment represented by two seq...
Definition: cigar_from_alignment.hpp:114
@ ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
@ alignment
The (pairwise) alignment stored in an object that models seqan3::detail::pairwise_alignment.
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: type_pack/traits.hpp:164
constexpr auto zip
A view adaptor that takes several views and returns tuple-like values from every i-th element of each...
Definition: zip.hpp:573
Whether a type behaves like a tuple.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Helper struct to specialise soft and hard clipping when using seqan3::cigar_from_alignment.
Definition: cigar_from_alignment.hpp:30
uint32_t hard_front
The number of hard clipped bases at the front of the CIGAR string.
Definition: cigar_from_alignment.hpp:31
uint32_t soft_back
The number of soft clipped bases at the back of the CIGAR string.
Definition: cigar_from_alignment.hpp:34
uint32_t hard_back
The number of hard clipped bases at the back of the CIGAR string.
Definition: cigar_from_alignment.hpp:32
uint32_t soft_front
The number of soft clipped bases at the front of the CIGAR string.
Definition: cigar_from_alignment.hpp:33
Provides seqan3::views::zip.