1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.utgenome.util.aligner;
26
27 import org.utgenome.format.fasta.GenomeSequence;
28
29
30
31
32
33
34
35 public class SmithWatermanAligner {
36
37 public static class Alignment {
38 public final String cigar;
39 public final int score;
40 public final int pos;
41 public final String rseq;
42 public final String qseq;
43
44 public Alignment(String cigar, int score, String rseq, int pos, String qseq) {
45 this.cigar = cigar;
46 this.score = score;
47 this.pos = pos;
48 this.rseq = rseq;
49 this.qseq = qseq;
50 }
51 }
52
53 private enum Trace {
54 NONE, DIAGONAL, LEFT, UP
55 };
56
57 public static class Config {
58 public int MATCH_SCORE = 1;
59 public int MISMATCH_PENALTY = 3;
60 public int GAPOPEN_PENALTY = 5;
61
62
63
64 public boolean BSS_ALIGNMENT = true;
65 public int BSS_CT_MISMATCH_PENALTY = 0;
66 }
67
68 private final Config config;
69
70 public SmithWatermanAligner() {
71 this(new Config());
72 }
73
74 public SmithWatermanAligner(Config config) {
75 this.config = config;
76 }
77
78 public static class StringWrapper implements GenomeSequence {
79
80 public final String seq;
81
82 public StringWrapper(String seq) {
83 this.seq = seq;
84 }
85
86 public char charAt(int index) {
87 return seq.charAt(index);
88 }
89
90 public int length() {
91 return seq.length();
92 }
93
94 }
95
96
97
98
99
100
101
102 public static GenomeSequence wrap(String seq) {
103 return new StringWrapper(seq);
104 }
105
106 public Alignment align(String seq1, String seq2) {
107 return align(wrap(seq1), wrap(seq2));
108 }
109
110 public <Seq1 extends GenomeSequence, Seq2 extends GenomeSequence> Alignment align(Seq1 seq1, Seq2 seq2) {
111
112 final int N = seq1.length() + 1;
113 final int M = seq2.length() + 1;
114
115
116 int score[][] = new int[N][M];
117 Trace trace[][] = new Trace[N][M];
118
119 score[0][0] = 0;
120 trace[0][0] = Trace.NONE;
121
122
123 for (int x = 1; x < N; ++x) {
124 score[x][0] = 0;
125 trace[x][0] = Trace.NONE;
126 }
127 for (int y = 1; y < M; ++y) {
128 score[0][y] = 0;
129 trace[0][y] = Trace.NONE;
130 }
131
132
133 int maxScore = 0;
134 int maxX = 0;
135 int maxY = 0;
136
137
138 for (int x = 1; x < N; ++x) {
139 for (int y = 1; y < M; ++y) {
140 char c1 = Character.toLowerCase(seq1.charAt(x - 1));
141 char c2 = Character.toLowerCase(seq2.charAt(y - 1));
142
143
144 int S, I, D;
145 if (c1 == c2)
146 S = score[x - 1][y - 1] + config.MATCH_SCORE;
147 else {
148 if (config.BSS_ALIGNMENT && (c1 == 'c' && c2 == 't')) {
149 S = score[x - 1][y - 1] - config.BSS_CT_MISMATCH_PENALTY;
150 }
151 else
152 S = score[x - 1][y - 1] - config.MISMATCH_PENALTY;
153 }
154
155 I = score[x][y - 1] - config.GAPOPEN_PENALTY;
156 D = score[x - 1][y] - config.GAPOPEN_PENALTY;
157
158 if (S <= 0 && I <= 0 && D <= 0) {
159 score[x][y] = 0;
160 trace[x][y] = Trace.NONE;
161 continue;
162 }
163
164
165 if (S >= I) {
166 if (S >= D) {
167 score[x][y] = S;
168 trace[x][y] = Trace.DIAGONAL;
169 }
170 else {
171 score[x][y] = D;
172 trace[x][y] = Trace.LEFT;
173 }
174 }
175 else if (I >= D) {
176 score[x][y] = I;
177 trace[x][y] = Trace.UP;
178 }
179 else {
180 score[x][y] = D;
181 trace[x][y] = Trace.LEFT;
182 }
183
184
185 if (score[x][y] > maxScore) {
186 maxX = x;
187 maxY = y;
188 maxScore = score[x][y];
189 }
190 }
191 }
192
193
194 StringBuilder cigar = new StringBuilder();
195 StringBuilder a1 = new StringBuilder();
196 StringBuilder a2 = new StringBuilder();
197
198 int leftMostPos = 0;
199
200 for (int i = M - 1; i > maxY; --i) {
201 cigar.append("S");
202 }
203 boolean toContinue = true;
204
205 int x = N - 1;
206 int y = M - 1;
207
208 while (x > maxX) {
209 a1.append(seq1.charAt(x - 1));
210 x--;
211 }
212 while (y > maxY) {
213 a2.append(Character.toLowerCase(seq2.charAt(y - 1)));
214 y--;
215 }
216
217 for (x = maxX, y = maxY; toContinue;) {
218 switch (trace[x][y]) {
219 case DIAGONAL:
220 cigar.append("M");
221 a1.append(seq1.charAt(x - 1));
222 a2.append(seq2.charAt(y - 1));
223 leftMostPos = x - 1;
224 x--;
225 y--;
226 break;
227 case LEFT:
228 cigar.append("D");
229 a1.append("-");
230 a2.append(seq2.charAt(y - 1));
231 leftMostPos = x - 1;
232 x--;
233 break;
234 case UP:
235 cigar.append("I");
236 a1.append(seq1.charAt(x - 1));
237 a2.append("-");
238 y--;
239 break;
240 case NONE:
241 toContinue = false;
242 while (x >= 1 || y >= 1) {
243 if (y >= 1) {
244 cigar.append("S");
245 a1.append(x >= 1 ? seq1.charAt(x - 1) : ' ');
246 a2.append(Character.toLowerCase(seq2.charAt(y - 1)));
247 }
248 else {
249 a1.append(x >= 1 ? seq1.charAt(x - 1) : ' ');
250 a2.append(' ');
251 }
252 x--;
253 y--;
254 }
255
256 break;
257 }
258 }
259
260
261 String cigarStr = cigar.reverse().toString();
262 char prev = cigarStr.charAt(0);
263 int count = 1;
264 StringBuilder compactCigar = new StringBuilder();
265 for (int i = 1; i < cigarStr.length(); ++i) {
266 char c = cigarStr.charAt(i);
267 if (prev == c) {
268 count++;
269 }
270 else {
271 compactCigar.append(Integer.toString(count));
272 compactCigar.append(prev);
273
274 prev = c;
275 count = 1;
276 }
277 }
278 if (count > 0) {
279 compactCigar.append(Integer.toString(count));
280 compactCigar.append(prev);
281 }
282
283 return new Alignment(compactCigar.toString(), maxScore, a1.reverse().toString(), leftMostPos, a2.reverse().toString());
284 }
285
286 }