View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2010 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // Kmer.java
20  // Since: 2010/11/10
21  //
22  //--------------------------------------
23  package org.utgenome.format.fasta;
24  
25  import org.utgenome.gwt.utgb.client.bio.ACGTEncoder;
26  
27  /**
28   * K-mer integer representation of ACGT.
29   * 
30   * The difference with {@link CompactACGT} is this K-mer representation is modifiable to facilitate genome sequence
31   * editing.
32   * 
33   * @author leo
34   * 
35   */
36  public class Kmer implements GenomeSequence {
37  
38  	private byte[] sequence2bit;
39  	private int size;
40  
41  	private Kmer(byte[] sequence2bit, int size) {
42  		this.sequence2bit = sequence2bit;
43  		this.size = size;
44  	}
45  
46  	public Kmer(Kmer other) {
47  		this.sequence2bit = new byte[other.sequence2bit.length];
48  		this.size = other.size;
49  		// copy
50  		for (int i = 0; i < sequence2bit.length; ++i) {
51  			sequence2bit[i] = other.sequence2bit[i];
52  		}
53  	}
54  
55  	public Kmer(CompactACGT acgt) {
56  		this.size = acgt.length();
57  		sequence2bit = new byte[acgt.sequence.length];
58  		for (int i = 0; i < sequence2bit.length; ++i) {
59  			sequence2bit[i] = 0;
60  		}
61  
62  		for (int i = 0; i < acgt.length(); ++i) {
63  			// TODO byte copy
64  			set(i, acgt.charAt(i));
65  		}
66  	}
67  
68  	public Kmer(String acgt) {
69  		this.size = acgt.length();
70  		sequence2bit = new byte[this.size / 4 + (this.size % 4 == 0 ? 0 : 1)];
71  		for (int i = 0; i < sequence2bit.length; ++i) {
72  			sequence2bit[i] = 0;
73  		}
74  		for (int i = 0; i < acgt.length(); ++i) {
75  			set(i, acgt.charAt(i));
76  		}
77  	}
78  
79  	public int toInt() {
80  		return toInt(size);
81  	}
82  
83  	public int toInt(int firstKmer) {
84  		if (firstKmer > 30)
85  			throw new IllegalArgumentException("The size must be <= 30: K = " + firstKmer);
86  		if (firstKmer > size) {
87  			throw new IllegalArgumentException("K must be less than or equal to " + size);
88  		}
89  
90  		int kmerInteger = 0;
91  		final int lastIndex = firstKmer / 4 + (firstKmer % 4 == 0 ? 0 : 1);
92  		final int K = firstKmer;
93  		for (int i = 0; i < lastIndex; i++) {
94  			if (i > 0) {
95  				kmerInteger <<= 8;
96  			}
97  
98  			int shiftSize = 0;
99  			if (i == lastIndex - 1) {
100 				int offset = K % 4;
101 				if (offset > 0)
102 					shiftSize = 8 - offset * 2;
103 			}
104 			int mask = 0xFF;
105 			mask >>>= shiftSize;
106 
107 			kmerInteger |= (sequence2bit[i] >>> shiftSize) & mask;
108 		}
109 
110 		return kmerInteger;
111 	}
112 
113 	public int length() {
114 		return size;
115 	}
116 
117 	public char charAt(int index) {
118 		final int offset = index % 4;
119 		byte code = (byte) (sequence2bit[index / 4] >>> (6 - offset * 2) & 0x03);
120 		return ACGTEncoder.toBase(code);
121 	}
122 
123 	public void set(int index, char acgt) {
124 		setACGT(sequence2bit, index, acgt);
125 	}
126 
127 	public void set(int index, String acgt) {
128 		set(index, acgt.charAt(0));
129 	}
130 
131 	public Kmer delete(int deletePos, int deleteLen) {
132 
133 		if (deletePos < 0)
134 			throw new IllegalArgumentException("deletePos must be >= 0");
135 
136 		if (deletePos + deleteLen > size) {
137 			deleteLen = size - deletePos;
138 		}
139 		final int newSize = size - deleteLen;
140 
141 		// prepare the new array for the inserted sequence 
142 		byte[] new2bitArray = new byte[newSize / 4 + (newSize % 4 == 0 ? 0 : 1)];
143 		for (int i = 0; i < new2bitArray.length; ++i)
144 			new2bitArray[i] = 0;
145 
146 		// paste the prefix
147 		final int prefixBlockSize = deletePos / 4 + (deletePos % 4 == 0 ? 0 : 1);
148 		for (int i = 0; i < prefixBlockSize; i++) {
149 
150 			int mask = 0xFF;
151 			if (i == prefixBlockSize - 1) {
152 				int offset = deletePos % 4;
153 				if (offset > 0) {
154 					int shiftSize = 8 - offset * 2;
155 					mask >>>= shiftSize;
156 					mask <<= shiftSize;
157 				}
158 			}
159 
160 			new2bitArray[i] = (byte) (sequence2bit[i] & mask);
161 		}
162 
163 		// paste the suffix
164 		final int suffixStart = deletePos + deleteLen;
165 		for (int i = suffixStart; i < size; ++i) {
166 			setACGT(new2bitArray, i - deleteLen, charAt(i));
167 		}
168 
169 		this.sequence2bit = new2bitArray;
170 		this.size = newSize;
171 
172 		return this;
173 	}
174 
175 	public Kmer append(String acgt) {
176 		return insert(this.size, acgt);
177 	}
178 
179 	public Kmer insert(int insertPos, String acgt) {
180 
181 		if (insertPos < 0)
182 			throw new IllegalArgumentException("insertPos must be >= 0");
183 
184 		final int insertLen = acgt.length();
185 		final int newSize = size + insertLen;
186 
187 		// prepare the new array for the inserted sequence 
188 		byte[] new2bitArray = new byte[newSize / 4 + (newSize % 4 == 0 ? 0 : 1)];
189 		for (int i = 0; i < new2bitArray.length; ++i)
190 			new2bitArray[i] = 0;
191 
192 		// paste the prefix
193 		final int prefixBlockSize = insertPos / 4 + (insertPos % 4 == 0 ? 0 : 1);
194 		for (int i = 0; i < prefixBlockSize; i++) {
195 
196 			int mask = 0xFF;
197 			if (i == prefixBlockSize - 1) {
198 				int offset = insertPos % 4;
199 				if (offset > 0) {
200 					int shiftSize = 8 - offset * 2;
201 					mask >>>= shiftSize;
202 					mask <<= shiftSize;
203 				}
204 			}
205 
206 			new2bitArray[i] = (byte) (sequence2bit[i] & mask);
207 		}
208 
209 		// paste the inserted sequence
210 		for (int i = 0; i < acgt.length(); i++) {
211 			final int blockPos = (insertPos + i) / 4;
212 			final int blockOffset = (insertPos + i) % 4;
213 			setACGT(new2bitArray, insertPos + i, acgt.charAt(i));
214 		}
215 
216 		// paste the suffix
217 		for (int i = insertPos; i < size; ++i) {
218 			setACGT(new2bitArray, i + insertLen, charAt(i));
219 		}
220 
221 		this.sequence2bit = new2bitArray;
222 		this.size = newSize;
223 		return this;
224 	}
225 
226 	private static void setACGT(byte[] array, int index, char acgt) {
227 		int code = ACGTEncoder.to2bitCode(acgt) & 0x03;
228 		int shiftSize = 6 - (index % 4) * 2;
229 
230 		final int blockIndex = index / 4;
231 
232 		array[blockIndex] &= ~(0x03 << shiftSize);
233 		array[blockIndex] |= (byte) (code << shiftSize);
234 	}
235 
236 	private static void setACGTCode(byte[] array, int index, int code) {
237 		array[index / 4] |= (byte) ((code & 0x03) << (6 - (index % 4) * 2));
238 	}
239 
240 	public Kmer reverseComplement() {
241 		byte[] rc = new byte[sequence2bit.length];
242 
243 		int rIndex = 0;
244 		for (int i = size - 1; i >= 0; --i, rIndex++) {
245 			final int offset = i % 4;
246 			int code = (sequence2bit[i / 4] >>> (6 - offset * 2)) & 0x03;
247 			setACGTCode(rc, rIndex, ~code);
248 		}
249 
250 		return new Kmer(rc, size);
251 	}
252 
253 	@Override
254 	public String toString() {
255 		StringBuilder s = new StringBuilder();
256 		for (int i = 0; i < size; i++) {
257 			s.append(this.charAt(i));
258 		}
259 		return s.toString();
260 
261 	}
262 
263 	@Override
264 	public boolean equals(Object obj) {
265 		if (!Kmer.class.isInstance(obj))
266 			return false;
267 
268 		Kmer other = Kmer.class.cast(obj);
269 
270 		if (this.size != other.size)
271 			return false;
272 
273 		for (int i = 0; i < this.sequence2bit.length; ++i) {
274 			if (this.sequence2bit[i] != other.sequence2bit[i])
275 				return false;
276 		}
277 
278 		return true;
279 	}
280 
281 	@Override
282 	public int hashCode() {
283 
284 		int h = 3;
285 		for (int i = 0; i < this.sequence2bit.length; ++i)
286 			h += sequence2bit[i] * 1997;
287 
288 		return h;
289 	}
290 
291 }