View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2009 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // SmithWatermanAlignment.java
20  // Since: Feb 22, 2010
21  //
22  // $URL$ 
23  // $Author$
24  //--------------------------------------
25  package org.utgenome.util.aligner;
26  
27  import org.utgenome.format.fasta.GenomeSequence;
28  
29  /**
30   * Simple Smith-Waterman based aligner
31   * 
32   * @author leo
33   * 
34   */
35  public class SmithWatermanAligner {
36  
37  	public static class Alignment {
38  		public final String cigar;
39  		public final int score;
40  		public final int pos; // 1-based leftmost position of the clipped sequence
41  		public final String rseq; // reference sequence
42  		public final String qseq; // query sequence
43  
44  		public Alignment(String cigar, int score, String rseq, int pos, String qseq) {
45  			this.cigar = cigar;
46  			this.score = score;
47  			this.pos = pos;
48  			this.rseq = rseq;
49  			this.qseq = qseq;
50  		}
51  	}
52  
53  	private enum Trace {
54  		NONE, DIAGONAL, LEFT, UP
55  	};
56  
57  	public static class Config {
58  		public int MATCH_SCORE = 1;
59  		public int MISMATCH_PENALTY = 3;
60  		public int GAPOPEN_PENALTY = 5;
61  		/**
62  		 * for BSS alignment
63  		 */
64  		public boolean BSS_ALIGNMENT = true;
65  		public int BSS_CT_MISMATCH_PENALTY = 0;
66  	}
67  
68  	private final Config config;
69  
70  	public SmithWatermanAligner() {
71  		this(new Config());
72  	}
73  
74  	public SmithWatermanAligner(Config config) {
75  		this.config = config;
76  	}
77  
78  	public static class StringWrapper implements GenomeSequence {
79  
80  		public final String seq;
81  
82  		public StringWrapper(String seq) {
83  			this.seq = seq;
84  		}
85  
86  		public char charAt(int index) {
87  			return seq.charAt(index);
88  		}
89  
90  		public int length() {
91  			return seq.length();
92  		}
93  
94  	}
95  
96  	/**
97  	 * Wrap the input string as {@link GenomeSequence}
98  	 * 
99  	 * @param seq
100 	 * @return
101 	 */
102 	public static GenomeSequence wrap(String seq) {
103 		return new StringWrapper(seq);
104 	}
105 
106 	public Alignment align(String seq1, String seq2) {
107 		return align(wrap(seq1), wrap(seq2));
108 	}
109 
110 	public <Seq1 extends GenomeSequence, Seq2 extends GenomeSequence> Alignment align(Seq1 seq1, Seq2 seq2) {
111 
112 		final int N = seq1.length() + 1;
113 		final int M = seq2.length() + 1;
114 
115 		// prepare the score matrix
116 		int score[][] = new int[N][M];
117 		Trace trace[][] = new Trace[N][M];
118 
119 		score[0][0] = 0;
120 		trace[0][0] = Trace.NONE;
121 
122 		// set the first row and column.  
123 		for (int x = 1; x < N; ++x) {
124 			score[x][0] = 0;
125 			trace[x][0] = Trace.NONE;
126 		}
127 		for (int y = 1; y < M; ++y) {
128 			score[0][y] = 0;
129 			trace[0][y] = Trace.NONE;
130 		}
131 
132 		// maximum score and its location 
133 		int maxScore = 0;
134 		int maxX = 0;
135 		int maxY = 0;
136 
137 		// SW loop
138 		for (int x = 1; x < N; ++x) {
139 			for (int y = 1; y < M; ++y) {
140 				char c1 = Character.toLowerCase(seq1.charAt(x - 1));
141 				char c2 = Character.toLowerCase(seq2.charAt(y - 1));
142 
143 				// match(S), insertion(I) to , deletion(D) from the seq1 scores
144 				int S, I, D;
145 				if (c1 == c2)
146 					S = score[x - 1][y - 1] + config.MATCH_SCORE;
147 				else {
148 					if (config.BSS_ALIGNMENT && (c1 == 'c' && c2 == 't')) {
149 						S = score[x - 1][y - 1] - config.BSS_CT_MISMATCH_PENALTY;
150 					}
151 					else
152 						S = score[x - 1][y - 1] - config.MISMATCH_PENALTY;
153 				}
154 
155 				I = score[x][y - 1] - config.GAPOPEN_PENALTY;
156 				D = score[x - 1][y] - config.GAPOPEN_PENALTY;
157 
158 				if (S <= 0 && I <= 0 && D <= 0) {
159 					score[x][y] = 0;
160 					trace[x][y] = Trace.NONE;
161 					continue;
162 				}
163 
164 				// choose the best score
165 				if (S >= I) {
166 					if (S >= D) {
167 						score[x][y] = S;
168 						trace[x][y] = Trace.DIAGONAL;
169 					}
170 					else {
171 						score[x][y] = D;
172 						trace[x][y] = Trace.LEFT;
173 					}
174 				}
175 				else if (I >= D) {
176 					score[x][y] = I;
177 					trace[x][y] = Trace.UP;
178 				}
179 				else {
180 					score[x][y] = D;
181 					trace[x][y] = Trace.LEFT;
182 				}
183 
184 				// update max score
185 				if (score[x][y] > maxScore) {
186 					maxX = x;
187 					maxY = y;
188 					maxScore = score[x][y];
189 				}
190 			}
191 		}
192 
193 		// trace back
194 		StringBuilder cigar = new StringBuilder();
195 		StringBuilder a1 = new StringBuilder();
196 		StringBuilder a2 = new StringBuilder();
197 
198 		int leftMostPos = 0; // for seq1 
199 
200 		for (int i = M - 1; i > maxY; --i) {
201 			cigar.append("S");
202 		}
203 		boolean toContinue = true;
204 
205 		int x = N - 1;
206 		int y = M - 1;
207 
208 		while (x > maxX) {
209 			a1.append(seq1.charAt(x - 1));
210 			x--;
211 		}
212 		while (y > maxY) {
213 			a2.append(Character.toLowerCase(seq2.charAt(y - 1)));
214 			y--;
215 		}
216 
217 		for (x = maxX, y = maxY; toContinue;) {
218 			switch (trace[x][y]) {
219 			case DIAGONAL:
220 				cigar.append("M");
221 				a1.append(seq1.charAt(x - 1));
222 				a2.append(seq2.charAt(y - 1));
223 				leftMostPos = x - 1;
224 				x--;
225 				y--;
226 				break;
227 			case LEFT:
228 				cigar.append("D");
229 				a1.append("-");
230 				a2.append(seq2.charAt(y - 1));
231 				leftMostPos = x - 1;
232 				x--;
233 				break;
234 			case UP:
235 				cigar.append("I");
236 				a1.append(seq1.charAt(x - 1));
237 				a2.append("-");
238 				y--;
239 				break;
240 			case NONE:
241 				toContinue = false;
242 				while (x >= 1 || y >= 1) {
243 					if (y >= 1) {
244 						cigar.append("S");
245 						a1.append(x >= 1 ? seq1.charAt(x - 1) : ' ');
246 						a2.append(Character.toLowerCase(seq2.charAt(y - 1)));
247 					}
248 					else {
249 						a1.append(x >= 1 ? seq1.charAt(x - 1) : ' ');
250 						a2.append(' ');
251 					}
252 					x--;
253 					y--;
254 				}
255 
256 				break;
257 			}
258 		}
259 
260 		// create cigar string
261 		String cigarStr = cigar.reverse().toString();
262 		char prev = cigarStr.charAt(0);
263 		int count = 1;
264 		StringBuilder compactCigar = new StringBuilder();
265 		for (int i = 1; i < cigarStr.length(); ++i) {
266 			char c = cigarStr.charAt(i);
267 			if (prev == c) {
268 				count++;
269 			}
270 			else {
271 				compactCigar.append(Integer.toString(count));
272 				compactCigar.append(prev);
273 
274 				prev = c;
275 				count = 1;
276 			}
277 		}
278 		if (count > 0) {
279 			compactCigar.append(Integer.toString(count));
280 			compactCigar.append(prev);
281 		}
282 
283 		return new Alignment(compactCigar.toString(), maxScore, a1.reverse().toString(), leftMostPos, a2.reverse().toString());
284 	}
285 
286 }