// Source path: root/third_party/googleapis/google/genomics/v1/readalignment.proto
// Source blob: c505584c54f36337dfe23a8be3cffd1c17d33dc8
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// This file uses the proto3 language version.
syntax = "proto3";

// All definitions below live in the `google.genomics.v1` namespace.
package google.genomics.v1;

// NOTE(review): nothing in the visible part of this file references a symbol
// from google/api/annotations.proto; confirm whether this import is still
// needed before touching it.
import "google/api/annotations.proto";
// Presumably provides `CigarUnit` (used by `LinearAlignment.cigar`).
import "google/genomics/v1/cigar.proto";
// Presumably provides `Position` (used by `LinearAlignment.position` and
// `Read.next_mate_position`).
import "google/genomics/v1/position.proto";
// Provides `google.protobuf.ListValue` (used by `Read.info`).
import "google/protobuf/struct.proto";

// Enable arena allocation for the generated C++ code.
option cc_enable_arenas = true;
// Go import path of the generated code; the part after `;` is the Go package
// name.
option go_package = "google.golang.org/genproto/googleapis/genomics/v1;genomics";
// Generate one Java source file per top-level definition instead of nesting
// everything inside the outer class.
option java_multiple_files = true;
// Name of the Java wrapper class generated for this file.
option java_outer_classname = "ReadAlignmentProto";
// Java package of the generated classes.
option java_package = "com.google.genomics.v1";


// A linear alignment can be represented by one CIGAR string. Describes the
// mapped position and local alignment of the read to the reference.
//
// NOTE(review): `Position` and `CigarUnit` are not defined in this file; they
// presumably come from the imported `position.proto` and `cigar.proto`.
message LinearAlignment {
  // The position of this alignment.
  Position position = 1;

  // The mapping quality of this alignment. Represents how likely
  // the read maps to this position as opposed to other locations.
  //
  // Specifically, this is `-10 * log10(Pr(mapping position is wrong))`,
  // rounded to the nearest integer. For example, a value of 30 corresponds to
  // a 1-in-1000 probability that the mapping position is wrong.
  int32 mapping_quality = 2;

  // Represents the local alignment of this sequence (alignment matches, indels,
  // etc) against the reference, as a list of CIGAR operations.
  repeated CigarUnit cigar = 3;
}

// A read alignment describes a linear alignment of a string of DNA to a
// [reference sequence][google.genomics.v1.Reference], in addition to metadata
// about the fragment (the molecule of DNA sequenced) and the read (the bases
// which were read by the sequencer). A read is equivalent to a line in a SAM
// file. A read belongs to exactly one read group and exactly one
// [read group set][google.genomics.v1.ReadGroupSet].
//
// Field names in this documentation are written in their lowerCamelCase JSON
// form, for example `alignedSequence` for the `aligned_sequence` field.
//
// For more genomics resource definitions, see [Fundamentals of Google
// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics).
//
// ### Reverse-stranded reads
//
// Mapped reads (reads having a non-null `alignment`) can be aligned to either
// the forward or the reverse strand of their associated reference. Strandedness
// of a mapped read is encoded by `alignment.position.reverseStrand`.
//
// If we consider the reference to be a forward-stranded coordinate space of
// `[0, reference.length)` with `0` as the left-most position and
// `reference.length` as the right-most position, reads are always aligned left
// to right. That is, `alignment.position.position` always refers to the
// left-most reference coordinate and `alignment.cigar` describes the alignment
// of this read to the reference from left to right. All per-base fields such as
// `alignedSequence` and `alignedQuality` share this same left-to-right
// orientation; this is true of reads which are aligned to either strand. For
// reverse-stranded reads, this means that `alignedSequence` is the reverse
// complement of the bases that were originally reported by the sequencing
// machine.
//
// ### Generating a reference-aligned sequence string
//
// When interacting with mapped reads, it's often useful to produce a string
// representing the local alignment of the read to the reference. The following
// pseudocode demonstrates one way of doing this (`offset` is the current index
// into `alignedSequence`):
//
//     out = ""
//     offset = 0
//     for c in read.alignment.cigar {
//       switch c.operation {
//       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
//         out += read.alignedSequence[offset:offset+c.operationLength]
//         offset += c.operationLength
//         break
//       case "CLIP_SOFT", "INSERT":
//         offset += c.operationLength
//         break
//       case "PAD":
//         out += repeat("*", c.operationLength)
//         break
//       case "DELETE":
//         out += repeat("-", c.operationLength)
//         break
//       case "SKIP":
//         out += repeat(" ", c.operationLength)
//         break
//       case "CLIP_HARD":
//         break
//       }
//     }
//     return out
//
// ### Converting to SAM's CIGAR string
//
// The following pseudocode generates a SAM CIGAR string from the
// `cigar` field. Note that this is a lossy conversion
// (`cigar.referenceSequence` is lost).
//
//     cigarMap = {
//       "ALIGNMENT_MATCH": "M",
//       "INSERT": "I",
//       "DELETE": "D",
//       "SKIP": "N",
//       "CLIP_SOFT": "S",
//       "CLIP_HARD": "H",
//       "PAD": "P",
//       "SEQUENCE_MATCH": "=",
//       "SEQUENCE_MISMATCH": "X",
//     }
//     cigarStr = ""
//     for c in read.alignment.cigar {
//       cigarStr += c.operationLength + cigarMap[c.operation]
//     }
//     return cigarStr
message Read {
  // The server-generated read ID, unique across all reads. This is different
  // from the `fragmentName`.
  string id = 1;

  // The ID of the read group this read belongs to. A read belongs to exactly
  // one read group. This is a server-generated ID which is distinct from SAM's
  // RG tag (for that value, see
  // [ReadGroup.name][google.genomics.v1.ReadGroup.name]).
  string read_group_id = 2;

  // The ID of the read group set this read belongs to. A read belongs to
  // exactly one read group set.
  string read_group_set_id = 3;

  // The fragment name. Equivalent to QNAME (query template name) in SAM.
  string fragment_name = 4;

  // Whether the orientation and the distance between reads from the fragment
  // are consistent with the sequencing protocol (SAM flag 0x2).
  bool proper_placement = 5;

  // Whether the fragment is a PCR or optical duplicate (SAM flag 0x400).
  bool duplicate_fragment = 6;

  // The observed length of the fragment, equivalent to TLEN in SAM.
  int32 fragment_length = 7;

  // The read number in sequencing. 0-based and less than `numberReads`. This
  // field replaces SAM flags 0x40 and 0x80.
  int32 read_number = 8;

  // The number of reads in the fragment (extension to SAM flag 0x1).
  int32 number_reads = 9;

  // Whether this read did not pass filters, such as platform or vendor quality
  // controls (SAM flag 0x200).
  bool failed_vendor_quality_checks = 10;

  // The linear alignment for this alignment record. This field is null (not
  // set) for unmapped reads.
  LinearAlignment alignment = 11;

  // Whether this alignment is secondary. Equivalent to SAM flag 0x100.
  // A secondary alignment represents an alternative to the primary alignment
  // for this read. Aligners may return secondary alignments if a read can map
  // ambiguously to multiple coordinates in the genome. By convention, each read
  // has one and only one alignment where both `secondaryAlignment`
  // and `supplementaryAlignment` are false.
  bool secondary_alignment = 12;

  // Whether this alignment is supplementary. Equivalent to SAM flag 0x800.
  // Supplementary alignments are used in the representation of a chimeric
  // alignment. In a chimeric alignment, a read is split into multiple
  // linear alignments that map to different reference contigs. The first
  // linear alignment in the read will be designated as the representative
  // alignment; the remaining linear alignments will be designated as
  // supplementary alignments. These alignments may have different mapping
  // quality scores. In each linear alignment in a chimeric alignment, the read
  // will be hard clipped. The `alignedSequence` and `alignedQuality` fields in
  // the alignment record will only represent the bases for its respective
  // linear alignment.
  bool supplementary_alignment = 13;

  // The bases of the read sequence contained in this alignment record,
  // **without CIGAR operations applied** (equivalent to SEQ in SAM).
  // `alignedSequence` and `alignedQuality` may be shorter than the full read
  // sequence and quality. This will occur if the alignment is part of a
  // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR
  // for this read will begin/end with a hard clip operator that will indicate
  // the length of the excised sequence.
  string aligned_sequence = 14;

  // The quality of the read sequence contained in this alignment record
  // (equivalent to QUAL in SAM). This is a per-base field that shares the
  // left-to-right orientation of `alignedSequence`.
  // `alignedSequence` and `alignedQuality` may be shorter than the full read
  // sequence and quality. This will occur if the alignment is part of a
  // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR
  // for this read will begin/end with a hard clip operator that will indicate
  // the length of the excised sequence.
  repeated int32 aligned_quality = 15;

  // The mapping of the primary alignment of the
  // `(readNumber+1)%numberReads` read in the fragment. It replaces
  // mate position and mate strand in SAM.
  Position next_mate_position = 16;

  // A map of additional read alignment information. This must be of the form
  // `map<string, string[]>` (string key mapping to a list of string values),
  // i.e. every element of each `ListValue` is expected to be a string.
  map<string, google.protobuf.ListValue> info = 17;
}