SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
 
Loading...
Searching...
No Matches
format_bam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <bit>
16#include <cstring>
17#include <iterator>
18#include <ranges>
19#include <string>
20#include <vector>
21
34
35namespace seqan3
36{
37
51{
52public:
56 // string_buffer is of type std::string and has some problems with pre-C++11 ABI
57 format_bam() = default;
58 format_bam(format_bam const &) = default;
59 format_bam & operator=(format_bam const &) = default;
60 format_bam(format_bam &&) = default;
62 ~format_bam() = default;
63
65
68
69protected:
70 template <typename stream_type, // constraints checked by file
71 typename seq_legal_alph_type,
72 typename ref_seqs_type,
73 typename ref_ids_type,
74 typename stream_pos_type,
75 typename seq_type,
76 typename id_type,
77 typename ref_seq_type,
78 typename ref_id_type,
79 typename ref_offset_type,
80 typename cigar_type,
81 typename flag_type,
82 typename mapq_type,
83 typename qual_type,
84 typename mate_type,
85 typename tag_dict_type,
86 typename e_value_type,
87 typename bit_score_type>
88 void read_alignment_record(stream_type & stream,
89 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
90 ref_seqs_type & ref_seqs,
92 stream_pos_type & position_buffer,
93 seq_type & seq,
94 qual_type & qual,
95 id_type & id,
96 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
97 ref_id_type & ref_id,
98 ref_offset_type & ref_offset,
99 cigar_type & cigar_vector,
100 flag_type & flag,
101 mapq_type & mapq,
102 mate_type & mate,
103 tag_dict_type & tag_dict,
104 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
105 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
106
107 template <typename stream_type,
108 typename header_type,
109 typename seq_type,
110 typename id_type,
111 typename ref_seq_type,
112 typename ref_id_type,
113 typename cigar_type,
114 typename qual_type,
115 typename mate_type,
116 typename tag_dict_type>
117 void write_alignment_record([[maybe_unused]] stream_type & stream,
118 [[maybe_unused]] sam_file_output_options const & options,
119 [[maybe_unused]] header_type && header,
120 [[maybe_unused]] seq_type && seq,
121 [[maybe_unused]] qual_type && qual,
122 [[maybe_unused]] id_type && id,
123 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
124 [[maybe_unused]] ref_id_type && ref_id,
125 [[maybe_unused]] std::optional<int32_t> ref_offset,
126 [[maybe_unused]] cigar_type && cigar_vector,
127 [[maybe_unused]] sam_flag const flag,
128 [[maybe_unused]] uint8_t const mapq,
129 [[maybe_unused]] mate_type && mate,
130 [[maybe_unused]] tag_dict_type && tag_dict,
131 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
132 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
133
135 template <typename stream_t, typename header_type>
136 void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);
137
138private:
140 bool header_was_read{false};
141
144
147 { // naming corresponds to official SAM/BAM specifications
148 int32_t block_size;
149 int32_t refID;
150 int32_t pos;
151 uint32_t l_read_name : 8;
152 uint32_t mapq : 8;
153 uint32_t bin : 16;
154 uint32_t n_cigar_op : 16;
156 int32_t l_seq;
157 int32_t next_refID;
158 int32_t next_pos;
159 int32_t tlen;
160 };
161
162 static_assert(sizeof(alignment_record_core) == 36);
163
164 // clang-format off
167 {
168 []() constexpr {
170
171 using index_t = std::make_unsigned_t<char>;
172
173 // ret['M'] = 0; set anyway by initialization
174 ret[static_cast<index_t>('I')] = 1;
175 ret[static_cast<index_t>('D')] = 2;
176 ret[static_cast<index_t>('N')] = 3;
177 ret[static_cast<index_t>('S')] = 4;
178 ret[static_cast<index_t>('H')] = 5;
179 ret[static_cast<index_t>('P')] = 6;
180 ret[static_cast<index_t>('=')] = 7;
181 ret[static_cast<index_t>('X')] = 8;
182
183 return ret;
184 }()
185 };
186 // clang-format on
187
189 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
190 {
191 --end;
192 if (beg >> 14 == end >> 14)
193 return ((1 << 15) - 1) / 7 + (beg >> 14);
194 if (beg >> 17 == end >> 17)
195 return ((1 << 12) - 1) / 7 + (beg >> 17);
196 if (beg >> 20 == end >> 20)
197 return ((1 << 9) - 1) / 7 + (beg >> 20);
198 if (beg >> 23 == end >> 23)
199 return ((1 << 6) - 1) / 7 + (beg >> 23);
200 if (beg >> 26 == end >> 26)
201 return ((1 << 3) - 1) / 7 + (beg >> 26);
202 return 0;
203 }
204
211 template <typename stream_view_type, std::integral number_type>
212 void read_integral_byte_field(stream_view_type && stream_view, number_type & target)
213 {
214 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
215 }
216
218 template <std::integral number_type>
219 void read_integral_byte_field(std::string_view const str, number_type & target)
220 {
221 std::memcpy(&target, str.data(), sizeof(target));
222 }
223
229 template <typename stream_view_type>
230 void read_float_byte_field(stream_view_type && stream_view, float & target)
231 {
232 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
233 }
234
235 template <typename value_type>
237 std::string_view const str,
238 value_type const & SEQAN3_DOXYGEN_ONLY(value));
239
240 void read_sam_dict(std::string_view const tag_str, sam_tag_dictionary & target);
241
243
244 static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
245};
246
248template <typename stream_type, // constraints checked by file
249 typename seq_legal_alph_type,
250 typename ref_seqs_type,
251 typename ref_ids_type,
252 typename stream_pos_type,
253 typename seq_type,
254 typename id_type,
255 typename ref_seq_type,
256 typename ref_id_type,
257 typename ref_offset_type,
258 typename cigar_type,
259 typename flag_type,
260 typename mapq_type,
261 typename qual_type,
262 typename mate_type,
263 typename tag_dict_type,
264 typename e_value_type,
265 typename bit_score_type>
266inline void
268 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
269 ref_seqs_type & ref_seqs,
271 stream_pos_type & position_buffer,
272 seq_type & seq,
273 qual_type & qual,
274 id_type & id,
275 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
276 ref_id_type & ref_id,
277 ref_offset_type & ref_offset,
278 cigar_type & cigar_vector,
279 flag_type & flag,
280 mapq_type & mapq,
281 mate_type & mate,
282 tag_dict_type & tag_dict,
283 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
284 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
285{
286 static_assert(detail::decays_to_ignore_v<ref_offset_type>
287 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
288 "The ref_offset must be a specialisation of std::optional.");
289
290 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
291 "The type of field::mapq must be uint8_t.");
292
293 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
294 "The type of field::flag must be seqan3::sam_flag.");
295
296 auto stream_view = seqan3::detail::istreambuf(stream);
297
298 // Header
299 // -------------------------------------------------------------------------------------------------------------
300 if (!header_was_read)
301 {
302 // magic BAM string
304 throw format_error{"File is not in BAM format."};
305
306 int32_t l_text{}; // length of header text including \0 character
307 int32_t n_ref{}; // number of reference sequences
308 int32_t l_name{}; // 1 + length of reference name including \0 character
309 int32_t l_ref{}; // length of reference sequence
310
311 read_integral_byte_field(stream_view, l_text);
312
313 if (l_text > 0) // header text is present
314 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
315
316 read_integral_byte_field(stream_view, n_ref);
317
318 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
319 {
320 read_integral_byte_field(stream_view, l_name);
321
322 string_buffer.resize(l_name - 1);
324 l_name - 1,
325 string_buffer.data()); // copy without \0 character
326 ++std::ranges::begin(stream_view); // skip \0 character
327
328 read_integral_byte_field(stream_view, l_ref);
329
330 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
331 {
332 // If there was no header text, we parse reference sequences block as header information
333 if (l_text == 0)
334 {
335 auto & reference_ids = header.ref_ids();
336 // put the length of the reference sequence into ref_id_info
337 header.ref_id_info.emplace_back(l_ref, "");
338 // put the reference name into reference_ids
339 reference_ids.push_back(string_buffer);
340 // assign the reference name an ascending reference id (starts at index 0).
341 header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
342 continue;
343 }
344 }
345
346 auto id_it = header.ref_dict.find(string_buffer);
347
348 // sanity checks of reference information to existing header object:
349 if (id_it == header.ref_dict.end()) // [unlikely]
350 {
351 throw format_error{detail::to_string("Unknown reference name '" + string_buffer
352 + "' found in BAM file header (header.ref_ids():",
353 header.ref_ids(),
354 ").")};
355 }
356 else if (id_it->second != ref_idx) // [unlikely]
357 {
358 throw format_error{detail::to_string("Reference id '",
360 "' at position ",
361 ref_idx,
362 " does not correspond to the position ",
363 id_it->second,
364 " in the header (header.ref_ids():",
365 header.ref_ids(),
366 ").")};
367 }
368 else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
369 {
370 throw format_error{"Provided reference has unequal length as specified in the header."};
371 }
372 }
373
374 header_was_read = true;
375
376 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
377 return;
378 }
379
380 // read alignment record into buffer
381 // -------------------------------------------------------------------------------------------------------------
382 position_buffer = stream.tellg();
383
384 auto stream_it = detail::fast_istreambuf_iterator{*stream.rdbuf()};
385
387 std::string_view const core_str = stream_it.cache_bytes(sizeof(core));
388 std::ranges::copy(core_str, reinterpret_cast<char *>(&core));
389
390 if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
391 {
392 throw format_error{detail::to_string("Reference id index '",
393 core.refID,
394 "' is not in range of ",
395 "header.ref_ids(), which has size ",
396 header.ref_ids().size(),
397 ".")};
398 }
399 else if (core.refID > -1) // not unmapped
400 {
401 ref_id = core.refID; // field::ref_id
402 }
403
404 flag = core.flag; // field::flag
405 mapq = core.mapq; // field::mapq
406
407 if (core.pos > -1) // [[likely]]
408 ref_offset = core.pos; // field::ref_offset
409
410 if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
411 {
412 if (core.next_refID > -1)
413 get<0>(mate) = core.next_refID;
414
415 if (core.next_pos > -1) // [[likely]]
416 get<1>(mate) = core.next_pos;
417
418 get<2>(mate) = core.tlen;
419 }
420
421 // read id
422 // -------------------------------------------------------------------------------------------------------------
423 std::string_view record_str = stream_it.cache_bytes(core.block_size - (sizeof(alignment_record_core) - 4));
424 size_t considered_bytes{0};
425
426 if constexpr (!detail::decays_to_ignore_v<id_type>)
427 read_forward_range_field(record_str.substr(0, core.l_read_name - 1), id);
428
429 considered_bytes += core.l_read_name;
430
431 // read cigar string
432 // -------------------------------------------------------------------------------------------------------------
433 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
434 cigar_vector = parse_binary_cigar(record_str.substr(considered_bytes, core.n_cigar_op * 4));
435
436 considered_bytes += core.n_cigar_op * 4;
437
438 // read sequence
439 // -------------------------------------------------------------------------------------------------------------
440 if constexpr (!detail::decays_to_ignore_v<seq_type>)
441 {
442 size_t const number_of_bytes = (core.l_seq + 1) / 2;
443 std::string_view const seq_str = record_str.substr(considered_bytes, number_of_bytes);
444
445 seq.resize(
446 core.l_seq
447 + 1 /* reserve one more in case size is uneven. will be corrected */); // TODO: .resize() is not generic
448
449 using alph_t = std::ranges::range_value_t<decltype(seq)>;
450 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
451
452 // 1 byte encodes two sequence characters
453 for (size_t i = 0, j = 0; i < number_of_bytes; ++i, j += 2)
454 {
455 seq[j] = from_dna16[to_rank(dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(seq_str[i]) >> 4)))];
456 seq[j + 1] =
457 from_dna16[to_rank(dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(seq_str[i]) & 0x0f)))];
458 }
459
460 seq.resize(core.l_seq); // remove extra letter
461 }
462
463 considered_bytes += (core.l_seq + 1) / 2;
464
465 // read qual string
466 // -------------------------------------------------------------------------------------------------------------
467 if constexpr (!detail::decays_to_ignore_v<qual_type>)
468 {
469 std::string_view const qual_str = record_str.substr(considered_bytes, core.l_seq);
470 qual.resize(core.l_seq); // TODO: this is not generic
471
472 for (int32_t i = 0; i < core.l_seq; ++i)
473 qual[i] = assign_char_to(static_cast<char>(qual_str[i] + 33), std::ranges::range_value_t<qual_type>{});
474 }
475
476 considered_bytes += core.l_seq;
477
478 // All remaining optional fields if any: SAM tags dictionary
479 // -------------------------------------------------------------------------------------------------------------
480 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
481 read_sam_dict(record_str.substr(considered_bytes), tag_dict);
482
483 // DONE READING - wrap up
484 // -------------------------------------------------------------------------------------------------------------
485 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
486 {
487 int32_t const sc_front = soft_clipping_at_front(cigar_vector);
488
489 // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
490 // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
491 // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
492 if (core.l_seq != 0 && sc_front == core.l_seq)
493 {
494 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
495 { // maybe only throw in debug mode and otherwise return an empty alignment?
496 throw format_error{
497 detail::to_string("The cigar string '",
498 detail::get_cigar_string(cigar_vector),
499 "' suggests that the cigar string exceeded 65535 elements and was therefore ",
500 "stored in the optional field CG. You need to read in the field::tags and "
501 "field::seq in order to access this information.")};
502 }
503 else
504 {
505 auto it = tag_dict.find("CG"_tag);
506
507 if (it == tag_dict.end())
508 throw format_error{
509 detail::to_string("The cigar string '",
510 detail::get_cigar_string(cigar_vector),
511 "' suggests that the cigar string exceeded 65535 elements and was therefore ",
512 "stored in the optional field CG but this tag is not present in the given ",
513 "record.")};
514
515 cigar_vector = detail::parse_cigar(std::get<std::string>(it->second));
516 tag_dict.erase(it); // remove redundant information
517 }
518 }
519 }
520}
521
523template <typename stream_type,
524 typename header_type,
525 typename seq_type,
526 typename id_type,
527 typename ref_seq_type,
528 typename ref_id_type,
529 typename cigar_type,
530 typename qual_type,
531 typename mate_type,
532 typename tag_dict_type>
533inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
534 [[maybe_unused]] sam_file_output_options const & options,
535 [[maybe_unused]] header_type && header,
536 [[maybe_unused]] seq_type && seq,
537 [[maybe_unused]] qual_type && qual,
538 [[maybe_unused]] id_type && id,
539 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
540 [[maybe_unused]] ref_id_type && ref_id,
541 [[maybe_unused]] std::optional<int32_t> ref_offset,
542 [[maybe_unused]] cigar_type && cigar_vector,
543 [[maybe_unused]] sam_flag const flag,
544 [[maybe_unused]] uint8_t const mapq,
545 [[maybe_unused]] mate_type && mate,
546 [[maybe_unused]] tag_dict_type && tag_dict,
547 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
548 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
549{
550 // ---------------------------------------------------------------------
551 // Type Requirements (as static asserts for user friendliness)
552 // ---------------------------------------------------------------------
553 static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
554 "The seq object must be a std::ranges::forward_range over "
555 "letters that model seqan3::alphabet.");
556
557 static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
558 "The id object must be a std::ranges::forward_range over "
559 "letters that model seqan3::alphabet.");
560
561 static_assert((std::ranges::forward_range<ref_seq_type> && alphabet<std::ranges::range_reference_t<ref_seq_type>>),
562 "The ref_seq object must be a std::ranges::forward_range "
563 "over letters that model seqan3::alphabet.");
564
565 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
566 {
567 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
568 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
569 "The ref_id object must be a std::ranges::forward_range "
570 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
571 }
572
573 static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
574 "The qual object must be a std::ranges::forward_range "
575 "over letters that model seqan3::alphabet.");
576
578 "The mate object must be a std::tuple of size 3 with "
579 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
580 "2) a std::integral or std::optional<std::integral>, and "
581 "3) a std::integral.");
582
583 static_assert(
584 ((std::ranges::forward_range<decltype(std::get<0>(mate))>
585 || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
586 || detail::is_type_specialisation_of_v<
587 std::remove_cvref_t<decltype(std::get<0>(mate))>,
588 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
589 || detail::is_type_specialisation_of_v<
590 std::remove_cvref_t<decltype(std::get<1>(mate))>,
591 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
592 "The mate object must be a std::tuple of size 3 with "
593 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
594 "2) a std::integral or std::optional<std::integral>, and "
595 "3) a std::integral.");
596
597 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
598 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
599
600 if constexpr (detail::decays_to_ignore_v<header_type>)
601 {
602 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
603 "You can either construct the output file with reference names and reference length "
604 "information and the header will be created for you, or you can access the `header` member "
605 "directly."};
606 }
607 else
608 {
609 // ---------------------------------------------------------------------
610 // logical Requirements
611 // ---------------------------------------------------------------------
612
613 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
614 throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
615
616 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
617
618 // ---------------------------------------------------------------------
619 // Writing the BAM Header on first call
620 // ---------------------------------------------------------------------
622 {
623 write_header(stream, options, header);
624 header_was_written = true;
625 }
626
627 // ---------------------------------------------------------------------
628 // Writing the Record
629 // ---------------------------------------------------------------------
630 int32_t ref_length{};
631
632 // Compute the ref_length from given cigar_vector which is needed to fill field `bin`.
633 if (!std::ranges::empty(cigar_vector))
634 {
635 int32_t dummy_seq_length{};
636 for (auto & [count, operation] : cigar_vector)
637 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
638 }
639
640 if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
641 {
642 tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
643 cigar_vector.resize(2);
644 cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
645 cigar_vector[1] = cigar{static_cast<uint32_t>(ref_length), 'N'_cigar_operation};
646 }
647
648 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
649
650 // Compute the value for the l_read_name field for the bam record.
651 // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
652 // the data type to store the value is uint8_t and 255 is the maximal size.
653 // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
654 // 2 bytes.
655 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
656 read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
657
658 alignment_record_core core{/* block_size */ 0, // will be initialised right after
659 /* refID */ -1, // will be initialised right after
660 /* pos */ ref_offset.value_or(-1),
661 /* l_read_name */ read_name_size,
662 /* mapq */ mapq,
663 /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
664 /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
665 /* flag */ flag,
666 /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
667 /* next_refId */ -1, // will be initialised right after
668 /* next_pos */ get<1>(mate).value_or(-1),
669 /* tlen */ get<2>(mate)};
670
671 auto check_and_assign_id_to = [&header]([[maybe_unused]] auto & id_source, [[maybe_unused]] auto & id_target)
672 {
673 using id_t = std::remove_reference_t<decltype(id_source)>;
674
675 if constexpr (!detail::decays_to_ignore_v<id_t>)
676 {
677 if constexpr (std::integral<id_t>)
678 {
679 id_target = id_source;
680 }
681 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
682 {
683 id_target = id_source.value_or(-1);
684 }
685 else
686 {
687 if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
688 {
689 auto id_it = header.ref_dict.end();
690
691 if constexpr (std::ranges::contiguous_range<decltype(id_source)>
692 && std::ranges::sized_range<decltype(id_source)>
693 && std::ranges::borrowed_range<decltype(id_source)>)
694 {
695 id_it = header.ref_dict.find(
696 std::span{std::ranges::data(id_source), std::ranges::size(id_source)});
697 }
698 else
699 {
700 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
701
702 static_assert(
703 implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
704 "The ref_id type is not convertible to the reference id information stored in the "
705 "reference dictionary of the header object.");
706
707 id_it = header.ref_dict.find(id_source);
708 }
709
710 if (id_it == header.ref_dict.end())
711 {
712 throw format_error{detail::to_string("Unknown reference name '",
713 id_source,
714 "' could "
715 "not be found in BAM header ref_dict: ",
716 header.ref_dict,
717 ".")};
718 }
719
720 id_target = id_it->second;
721 }
722 }
723 }
724 };
725
726 // initialise core.refID
727 check_and_assign_id_to(ref_id, core.refID);
728
729 // initialise core.next_refID
730 check_and_assign_id_to(get<0>(mate), core.next_refID);
731
732 // initialise core.block_size
733 core.block_size = sizeof(core) - 4 /*block_size excluded*/ + core.l_read_name + core.n_cigar_op * 4
734 + // each int32_t has 4 bytes
735 (core.l_seq + 1) / 2 + // bitcompressed seq
736 core.l_seq + // quality string
737 tag_dict_binary_str.size();
738
739 std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
740
741 if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
742 stream_it = '*';
743 else
744 std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
745 stream_it = '\0';
746
747 // write cigar
748 for (auto [cigar_count, op] : cigar_vector)
749 {
750 cigar_count = cigar_count << 4;
751 cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
752 std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
753 }
754
755 // write seq (bit-compressed: dna16sam characters go into one byte)
756 using alph_t = std::ranges::range_value_t<seq_type>;
757 constexpr auto to_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
758
759 auto sit = std::ranges::begin(seq);
760 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
761 {
762 uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
763 ++sidx, ++sit;
764 compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
765 stream_it = static_cast<char>(compressed_chr);
766 }
767
768 if (core.l_seq & 1) // write one more
769 stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
770
771 // write qual
772 if (std::ranges::empty(qual))
773 {
774 auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
775 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
776 }
777 else
778 {
779 if (std::ranges::distance(qual) != core.l_seq)
780 throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
781 core.l_seq,
782 ". Got quality with size ",
783 std::ranges::distance(qual),
784 " instead.")};
785
786 auto v = qual
787 | std::views::transform(
788 [](auto chr)
789 {
790 return static_cast<char>(to_rank(chr));
791 });
792 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
793 }
794
795 // write optional fields
796 stream << tag_dict_binary_str;
797 } // if constexpr (!detail::decays_to_ignore_v<header_type>)
798}
799
801template <typename stream_t, typename header_type>
802inline void format_bam::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header)
803{
804 if constexpr (detail::decays_to_ignore_v<header_type>)
805 {
806 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
807 "You can either construct the output file with reference names and reference length "
808 "information and the header will be created for you, or you can access the `header` member "
809 "directly."};
810 }
811 else
812 {
813 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
814
815 std::ranges::copy_n("BAM\1", 4, stream_it); // Do not copy the null terminator
816
817 // write SAM header to temporary stream first to query its size.
819 detail::format_sam_base::write_header(os, options, header);
820#if SEQAN3_WORKAROUND_GCC_NO_CXX11_ABI || (SEQAN3_COMPILER_IS_GCC && (__GNUC__ == 10))
821 int32_t const l_text{static_cast<int32_t>(os.str().size())};
822#else
823 int32_t const l_text{static_cast<int32_t>(os.view().size())};
824#endif
825 std::ranges::copy_n(reinterpret_cast<char const *>(&l_text), 4, stream_it); // write text length
826
827#if SEQAN3_WORKAROUND_GCC_NO_CXX11_ABI || (SEQAN3_COMPILER_IS_GCC && (__GNUC__ == 10))
828 auto header_view = os.str();
829#else
830 auto header_view = os.view();
831#endif
832 std::ranges::copy(header_view, stream_it);
833
834 assert(header.ref_ids().size() < (1ull << 32));
835 int32_t const n_ref{static_cast<int32_t>(header.ref_ids().size())};
836 std::ranges::copy_n(reinterpret_cast<char const *>(&n_ref), 4, stream_it); // write number of references
837
838 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
839 {
840 assert(header.ref_ids()[ridx].size() + 1 < (1ull << 32));
841 int32_t const l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
842 std::ranges::copy_n(reinterpret_cast<char const *>(&l_name), 4, stream_it); // write l_name
843 // write reference name:
844 std::ranges::copy(header.ref_ids()[ridx], stream_it);
845 stream_it = '\0'; // ++ is not necessary for ostream_iterator
846 // write reference sequence length:
847 std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
848 }
849 }
850}
851
870template <typename value_type>
872 std::string_view const str,
873 value_type const & SEQAN3_DOXYGEN_ONLY(value))
874{
875 auto it = str.begin();
876
877 // Read vector size from string_view and advance `it`.
878 int32_t const vector_size = [&]()
879 {
880 int32_t size{};
882 it += sizeof(size);
883 return size;
884 }();
885
886 int32_t bytes_left{vector_size};
887
888 std::vector<value_type> tmp_vector;
889 tmp_vector.reserve(vector_size);
890
891 value_type tmp{};
892
893 while (bytes_left > 0)
894 {
895 if constexpr (std::integral<value_type>)
897 else if constexpr (std::same_as<value_type, float>)
899 else
900 static_assert(std::is_same_v<value_type, void>, "format_bam::read_sam_dict_vector: unsupported value_type");
901
902 it += sizeof(tmp);
903 tmp_vector.push_back(std::move(tmp));
904 --bytes_left;
905 }
906
907 variant = std::move(tmp_vector);
908
909 return vector_size;
910}
911
928{
929 /* Every BAM tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
930 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
931 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
932 VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
933 by the length (int32_t) of the array, followed by the values.
934 */
935 auto it = tag_str.begin();
936
937 // Deduces int_t from passed argument.
938 auto parse_integer_into_target = [&]<std::integral int_t>(uint16_t const tag, int_t)
939 {
940 int_t tmp{};
941 read_integral_byte_field(std::string_view{it, tag_str.end()}, tmp);
942 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
943 it += sizeof(tmp);
944 };
945
946 // Deduces array_value_t from passed argument.
947 auto parse_array_into_target = [&]<arithmetic array_value_t>(uint16_t const tag, array_value_t)
948 {
949 int32_t const count = read_sam_dict_vector(target[tag], std::string_view{it, tag_str.end()}, array_value_t{});
950 it += sizeof(int32_t) /*length is stored within the vector*/ + sizeof(array_value_t) * count;
951 };
952
953 // Read uint16_t from string_view and advance `it`.
954 auto parse_tag = [&]()
955 {
956 uint16_t tag = static_cast<uint16_t>(*it) << 8;
957 ++it; // skip char read before
958 tag |= static_cast<uint16_t>(*it);
959 ++it; // skip char read before
960 return tag;
961 };
962
963 while (it != tag_str.end())
964 {
965 uint16_t const tag = parse_tag();
966
967 char const type_id{*it};
968 ++it; // skip char read before
969
970 switch (type_id)
971 {
972 case 'A': // char
973 {
974 target[tag] = *it;
975 ++it; // skip char that has been read
976 break;
977 }
978 // all integer sizes are possible
979 case 'c': // int8_t
980 {
981 parse_integer_into_target(tag, int8_t{});
982 break;
983 }
984 case 'C': // uint8_t
985 {
986 parse_integer_into_target(tag, uint8_t{});
987 break;
988 }
989 case 's': // int16_t
990 {
991 parse_integer_into_target(tag, int16_t{});
992 break;
993 }
994 case 'S': // uint16_t
995 {
996 parse_integer_into_target(tag, uint16_t{});
997 break;
998 }
999 case 'i': // int32_t
1000 {
1001 parse_integer_into_target(tag, int32_t{});
1002 break;
1003 }
1004 case 'I': // uint32_t
1005 {
1006 parse_integer_into_target(tag, uint32_t{});
1007 break;
1008 }
1009 case 'f': // float
1010 {
1011 float tmp{};
1012 read_float_byte_field(std::string_view{it, tag_str.end()}, tmp);
1013 target[tag] = tmp;
1014 it += sizeof(float);
1015 break;
1016 }
1017 case 'Z': // string
1018 {
1019 std::string const v{static_cast<char const *>(it)}; // parses until '\0'
1020 it += v.size() + 1;
1021 target[tag] = std::move(v);
1022 break;
1023 }
1024 case 'H': // byte array, represented as null-terminated string; specification requires even number of bytes
1025 {
1026 std::string_view const str{static_cast<char const *>(it)}; // parses until '\0'
1027
1028 std::vector<std::byte> tmp_vector{};
1029 // std::from_chars cannot directly parse into a std::byte
1030 uint8_t dummy_byte{};
1031
1032 if (str.size() % 2 != 0)
1033 throw format_error{"[CORRUPTED BAM FILE] Hexadecimal tag must have even number of digits."};
1034
1035 // H encodes bytes in a hexadecimal format. Two hex values are stored for each byte as characters.
1036 // E.g., '1' and 'A' need one byte each and are read as `\x1A`, which is 27 in decimal.
1037 for (auto hex_begin = str.begin(), hex_end = str.begin() + 2; hex_begin != str.end();
1038 hex_begin += 2, hex_end += 2)
1039 {
1040 auto res = std::from_chars(hex_begin, hex_end, dummy_byte, 16);
1041
1042 if (res.ec == std::errc::invalid_argument)
1043 throw format_error{std::string("[CORRUPTED BAM FILE] The string '")
1044 + std::string(hex_begin, hex_end) + "' could not be cast into type uint8_t."};
1045
1046 if (res.ec == std::errc::result_out_of_range)
1047 throw format_error{std::string("[CORRUPTED BAM FILE] Casting '") + std::string(str)
1048 + "' into type uint8_t would cause an overflow."};
1049
1050 tmp_vector.push_back(std::byte{dummy_byte});
1051 }
1052
1053 target[tag] = std::move(tmp_vector);
1054
1055 it += str.size() + 1;
1056
1057 break;
1058 }
1059 case 'B': // Array. Value type depends on second char [cCsSiIf]
1060 {
1061 char array_value_type_id = *it;
1062 ++it; // skip char read before
1063
1064 switch (array_value_type_id)
1065 {
1066 case 'c': // int8_t
1067 parse_array_into_target(tag, int8_t{});
1068 break;
1069 case 'C': // uint8_t
1070 parse_array_into_target(tag, uint8_t{});
1071 break;
1072 case 's': // int16_t
1073 parse_array_into_target(tag, int16_t{});
1074 break;
1075 case 'S': // uint16_t
1076 parse_array_into_target(tag, uint16_t{});
1077 break;
1078 case 'i': // int32_t
1079 parse_array_into_target(tag, int32_t{});
1080 break;
1081 case 'I': // uint32_t
1082 parse_array_into_target(tag, uint32_t{});
1083 break;
1084 case 'f': // float
1085 parse_array_into_target(tag, float{});
1086 break;
1087 default:
1088 throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1089 "must be one of [cCsSiIf] but '",
1090 array_value_type_id,
1091 "' was given.")};
1092 }
1093 break;
1094 }
1095 default:
1096 throw format_error{detail::to_string("The second character in the numerical id of a "
1097 "SAM tag must be one of [A,i,Z,H,B,f] but '",
1098 type_id,
1099 "' was given.")};
1100 }
1101 }
1102}
1103
1110{
1111 // The cigar operation is encoded in 4 bits.
1112 constexpr std::array<char, 16>
1113 cigar_operation_mapping{'M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X', '*', '*', '*', '*', '*', '*', '*'};
1114 // The rightmost 4 bits encode the operation, the other bits encode the count.
1115 constexpr uint32_t cigar_operation_mask = 0x0f; // rightmost 4 bits are set to one
1116
1117 std::vector<cigar> cigar_vector{};
1118 char operation{'\0'};
1119 uint32_t count{};
1120 uint32_t operation_and_count{}; // In BAM, operation and count values are stored within one 32 bit integer.
1121
1122 assert(cigar_str.size() % 4 == 0); // One cigar letter is stored in 4 bytes (uint32_t).
1123
1124 for (auto it = cigar_str.begin(); it != cigar_str.end(); it += sizeof(operation_and_count))
1125 {
1126 std::memcpy(&operation_and_count, it, sizeof(operation_and_count));
1127 operation = cigar_operation_mapping[operation_and_count & cigar_operation_mask];
1128 count = operation_and_count >> 4;
1129
1130 cigar_vector.emplace_back(count, seqan3::assign_char_strictly_to(operation, cigar::operation{}));
1131 }
1132
1133 return cigar_vector;
1134}
1135
1140{
1141 std::string result{};
1142
1143 auto stream_variant_fn = [&result](auto && arg) // helper to print a std::variant
1144 {
1145 // T is either char, int32_t, float, std::string, or a std::vector<some int>
1146 using T = std::remove_cvref_t<decltype(arg)>;
1147
1148 if constexpr (std::same_as<T, int32_t>)
1149 {
1150 // always choose the smallest possible representation [cCsSiI]
1151 size_t const absolute_arg = std::abs(arg);
1152 auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1153 bool const negative = arg < 0;
1154 n = n * n + 2 * negative; // for switch case order
1155
1156 switch (n)
1157 {
1158 case 0:
1159 {
1160 result[result.size() - 1] = 'C';
1161 result.append(reinterpret_cast<char const *>(&arg), 1);
1162 break;
1163 }
1164 case 1:
1165 {
1166 result[result.size() - 1] = 'S';
1167 result.append(reinterpret_cast<char const *>(&arg), 2);
1168 break;
1169 }
1170 case 2:
1171 {
1172 result[result.size() - 1] = 'c';
1173 int8_t tmp = static_cast<int8_t>(arg);
1174 result.append(reinterpret_cast<char const *>(&tmp), 1);
1175 break;
1176 }
1177 case 3:
1178 {
1179 result[result.size() - 1] = 's';
1180 int16_t tmp = static_cast<int16_t>(arg);
1181 result.append(reinterpret_cast<char const *>(&tmp), 2);
1182 break;
1183 }
1184 default:
1185 {
1186 result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1187 break;
1188 }
1189 }
1190 }
1191 else if constexpr (std::same_as<T, std::string>)
1192 {
1193 result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1 /*+ null character*/);
1194 }
1195 else if constexpr (!std::ranges::range<T>) // char, float
1196 {
1197 result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1198 }
1199 else // std::vector of some arithmetic_type type
1200 {
1201 int32_t sz{static_cast<int32_t>(arg.size())};
1202 result.append(reinterpret_cast<char *>(&sz), 4);
1203 result.append(reinterpret_cast<char const *>(arg.data()),
1204 arg.size() * sizeof(std::ranges::range_value_t<T>));
1205 }
1206 };
1207
1208 for (auto & [tag, variant] : tag_dict)
1209 {
1210 result.push_back(static_cast<char>(tag / 256));
1211 result.push_back(static_cast<char>(tag % 256));
1212
1213 result.push_back(detail::sam_tag_type_char[variant.index()]);
1214
1215 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1216 result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1217
1218 std::visit(stream_variant_fn, variant);
1219 }
1220
1221 return result;
1222}
1223
1224} // namespace seqan3
T begin(T... args)
T bit_ceil(T... args)
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:187
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: alphabet/cigar/cigar.hpp:60
Functionally the same as std::istreambuf_iterator, but faster.
Definition: fast_istreambuf_iterator.hpp:40
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition: fast_ostreambuf_iterator.hpp:40
The alignment base format.
Definition: format_sam_base.hpp:45
void write_header(stream_t &stream, sam_file_output_options const &options, header_type &header)
Writes the SAM header.
Definition: format_sam_base.hpp:614
int32_t soft_clipping_at_front(std::vector< cigar > const &cigar_vector) const
Returns the soft clipping value at the front of the cigar_vector or 0 if none present.
Definition: format_sam_base.hpp:148
void read_forward_range_field(stream_view_type &&stream_view, target_range_type &target)
Reads a range by copying from stream_view to target, converting values with seqan3::views::char_to.
Definition: format_sam_base.hpp:190
bool header_was_written
A variable that tracks whether the content of header has been written or not.
Definition: format_sam_base.hpp:66
void read_header(stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &)
Reads the SAM header.
Definition: format_sam_base.hpp:277
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The actual implementation of seqan3::cigar::operation for documentation purposes only....
Definition: cigar_operation.hpp:48
The BAM format.
Definition: format_bam.hpp:51
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:267
format_bam()=default
Defaulted.
int32_t read_sam_dict_vector(seqan3::detail::sam_tag_variant &variant, std::string_view const str, value_type const &value)
Reads a list of values separated by comma as it is the case for SAM tag arrays.
Definition: format_bam.hpp:871
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, cigar_type &&cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, double e_value, double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:533
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
void write_header(stream_t &stream, sam_file_output_options const &options, header_type &header)
Writes the SAM header.
Definition: format_bam.hpp:802
std::vector< cigar > parse_binary_cigar(std::string_view const cigar_str) const
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: format_bam.hpp:1109
format_bam(format_bam &&)=default
Defaulted.
void read_integral_byte_field(stream_view_type &&stream_view, number_type &target)
Reads a arithmetic field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:212
void read_integral_byte_field(std::string_view const str, number_type &target)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: format_bam.hpp:219
void read_sam_dict(std::string_view const tag_str, sam_tag_dictionary &target)
Reads the optional tag fields into the seqan3::sam_tag_dictionary.
Definition: format_bam.hpp:927
~format_bam()=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
void read_float_byte_field(stream_view_type &&stream_view, float &target)
Reads a float field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:230
static std::string get_tag_dict_str(sam_tag_dictionary const &tag_dict)
Writes the optional fields of the seqan3::sam_tag_dictionary.
Definition: format_bam.hpp:1139
std::string string_buffer
Local buffer to read into while avoiding reallocation.
Definition: format_bam.hpp:143
static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
Computes the bin number for a given region [beg, end), copied from the official SAM specifications.
Definition: format_bam.hpp:189
bool header_was_read
A variable that tracks whether the content of header has been read or not.
Definition: format_bam.hpp:140
static constexpr std::array< uint8_t, 256 > char_to_sam_rank
Converts a cigar op character to the rank according to the official BAM specifications.
Definition: format_bam.hpp:167
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:67
Stores the header information of SAM/BAM files.
Definition: header.hpp:49
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:149
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:188
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:185
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:343
T copy(T... args)
T copy_n(T... args)
T countr_zero(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T emplace_back(T... args)
T end(T... args)
T equal(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
T find(T... args)
Provides the seqan3::format_sam_base that can be inherited from.
T from_chars(T... args)
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition: alphabet/concept.hpp:524
constexpr auto assign_char_strictly_to
Assign a character to an alphabet object, throw if the character is not valid.
Definition: alphabet/concept.hpp:734
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: alphabet/concept.hpp:155
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
std::string get_cigar_string(std::vector< cigar > const &cigar_vector)
Transforms a vector of cigar elements into a string representation.
Definition: io/sam_file/detail/cigar.hpp:121
constexpr char sam_tag_type_char_extra[12]
Each types SAM tag type extra char id. Index corresponds to the seqan3::detail::sam_tag_variant types...
Definition: sam_tag_dictionary.hpp:45
void update_alignment_lengths(int32_t &ref_length, int32_t &seq_length, char const cigar_operation, uint32_t const cigar_count)
Updates the sequence lengths by cigar_count depending on the cigar operation op.
Definition: io/sam_file/detail/cigar.hpp:51
constexpr char sam_tag_type_char[12]
Each SAM tag type char identifier. Index corresponds to the seqan3::detail::sam_tag_variant types.
Definition: sam_tag_dictionary.hpp:42
constexpr std::vector< cigar > parse_cigar(std::string_view const cigar_str)
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: io/sam_file/detail/cigar.hpp:90
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly_view.hpp:590
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf_view.hpp:107
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ bit_score
The bit score (statistical significance indicator), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ id
The identifier, usually a string.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition: predicate.hpp:63
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
A type that satisfies std::is_arithmetic_v<t>.
Checks whether from can be implicityly converted to to.
Whether a type behaves like a tuple.
Auxiliary functions for the SAM IO.
Provides seqan3::detail::istreambuf.
T memcpy(T... args)
T min(T... args)
std::string to_string(value_type &&... values)
Streams all parameters via the seqan3::debug_stream and returns a concatenated string.
Definition: to_string.hpp:29
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::debug_stream and related types.
T push_back(T... args)
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_file_input_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Stores all fixed length variables which can be read/written directly by reinterpreting the binary str...
Definition: format_bam.hpp:147
uint32_t bin
The bin number.
Definition: format_bam.hpp:153
sam_flag flag
The flag value (uint16_t enum).
Definition: format_bam.hpp:155
uint32_t n_cigar_op
The number of cigar operations of the alignment.
Definition: format_bam.hpp:154
int32_t next_refID
The reference id of the mate.
Definition: format_bam.hpp:157
int32_t l_seq
The number of bases of the read sequence.
Definition: format_bam.hpp:156
int32_t refID
The reference id the read was mapped to.
Definition: format_bam.hpp:149
int32_t pos
The begin position of the alignment.
Definition: format_bam.hpp:150
int32_t block_size
The size in bytes of the whole BAM record.
Definition: format_bam.hpp:148
uint32_t l_read_name
The length of the read name including the \0 character.
Definition: format_bam.hpp:151
int32_t tlen
The template length of the read and its mate.
Definition: format_bam.hpp:159
uint32_t mapq
The mapping quality.
Definition: format_bam.hpp:152
int32_t next_pos
The begin position of the mate alignment.
Definition: format_bam.hpp:158
Thrown if information given to output format didn't match expectations.
Definition: io/exception.hpp:91
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sam_file/input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: sam_file/output_options.hpp:26
T substr(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
T visit(T... args)