1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.utgenome.format.fasta;
24
25 import org.utgenome.gwt.utgb.client.bio.ACGTEncoder;
26
27
28
29
30
31
32
33
34
35
36 public class Kmer implements GenomeSequence {
37
38 private byte[] sequence2bit;
39 private int size;
40
41 private Kmer(byte[] sequence2bit, int size) {
42 this.sequence2bit = sequence2bit;
43 this.size = size;
44 }
45
46 public Kmer(Kmer other) {
47 this.sequence2bit = new byte[other.sequence2bit.length];
48 this.size = other.size;
49
50 for (int i = 0; i < sequence2bit.length; ++i) {
51 sequence2bit[i] = other.sequence2bit[i];
52 }
53 }
54
55 public Kmer(CompactACGT acgt) {
56 this.size = acgt.length();
57 sequence2bit = new byte[acgt.sequence.length];
58 for (int i = 0; i < sequence2bit.length; ++i) {
59 sequence2bit[i] = 0;
60 }
61
62 for (int i = 0; i < acgt.length(); ++i) {
63
64 set(i, acgt.charAt(i));
65 }
66 }
67
68 public Kmer(String acgt) {
69 this.size = acgt.length();
70 sequence2bit = new byte[this.size / 4 + (this.size % 4 == 0 ? 0 : 1)];
71 for (int i = 0; i < sequence2bit.length; ++i) {
72 sequence2bit[i] = 0;
73 }
74 for (int i = 0; i < acgt.length(); ++i) {
75 set(i, acgt.charAt(i));
76 }
77 }
78
79 public int toInt() {
80 return toInt(size);
81 }
82
83 public int toInt(int firstKmer) {
84 if (firstKmer > 30)
85 throw new IllegalArgumentException("The size must be <= 30: K = " + firstKmer);
86 if (firstKmer > size) {
87 throw new IllegalArgumentException("K must be less than or equal to " + size);
88 }
89
90 int kmerInteger = 0;
91 final int lastIndex = firstKmer / 4 + (firstKmer % 4 == 0 ? 0 : 1);
92 final int K = firstKmer;
93 for (int i = 0; i < lastIndex; i++) {
94 if (i > 0) {
95 kmerInteger <<= 8;
96 }
97
98 int shiftSize = 0;
99 if (i == lastIndex - 1) {
100 int offset = K % 4;
101 if (offset > 0)
102 shiftSize = 8 - offset * 2;
103 }
104 int mask = 0xFF;
105 mask >>>= shiftSize;
106
107 kmerInteger |= (sequence2bit[i] >>> shiftSize) & mask;
108 }
109
110 return kmerInteger;
111 }
112
113 public int length() {
114 return size;
115 }
116
117 public char charAt(int index) {
118 final int offset = index % 4;
119 byte code = (byte) (sequence2bit[index / 4] >>> (6 - offset * 2) & 0x03);
120 return ACGTEncoder.toBase(code);
121 }
122
123 public void set(int index, char acgt) {
124 setACGT(sequence2bit, index, acgt);
125 }
126
127 public void set(int index, String acgt) {
128 set(index, acgt.charAt(0));
129 }
130
131 public Kmer delete(int deletePos, int deleteLen) {
132
133 if (deletePos < 0)
134 throw new IllegalArgumentException("deletePos must be >= 0");
135
136 if (deletePos + deleteLen > size) {
137 deleteLen = size - deletePos;
138 }
139 final int newSize = size - deleteLen;
140
141
142 byte[] new2bitArray = new byte[newSize / 4 + (newSize % 4 == 0 ? 0 : 1)];
143 for (int i = 0; i < new2bitArray.length; ++i)
144 new2bitArray[i] = 0;
145
146
147 final int prefixBlockSize = deletePos / 4 + (deletePos % 4 == 0 ? 0 : 1);
148 for (int i = 0; i < prefixBlockSize; i++) {
149
150 int mask = 0xFF;
151 if (i == prefixBlockSize - 1) {
152 int offset = deletePos % 4;
153 if (offset > 0) {
154 int shiftSize = 8 - offset * 2;
155 mask >>>= shiftSize;
156 mask <<= shiftSize;
157 }
158 }
159
160 new2bitArray[i] = (byte) (sequence2bit[i] & mask);
161 }
162
163
164 final int suffixStart = deletePos + deleteLen;
165 for (int i = suffixStart; i < size; ++i) {
166 setACGT(new2bitArray, i - deleteLen, charAt(i));
167 }
168
169 this.sequence2bit = new2bitArray;
170 this.size = newSize;
171
172 return this;
173 }
174
175 public Kmer append(String acgt) {
176 return insert(this.size, acgt);
177 }
178
179 public Kmer insert(int insertPos, String acgt) {
180
181 if (insertPos < 0)
182 throw new IllegalArgumentException("insertPos must be >= 0");
183
184 final int insertLen = acgt.length();
185 final int newSize = size + insertLen;
186
187
188 byte[] new2bitArray = new byte[newSize / 4 + (newSize % 4 == 0 ? 0 : 1)];
189 for (int i = 0; i < new2bitArray.length; ++i)
190 new2bitArray[i] = 0;
191
192
193 final int prefixBlockSize = insertPos / 4 + (insertPos % 4 == 0 ? 0 : 1);
194 for (int i = 0; i < prefixBlockSize; i++) {
195
196 int mask = 0xFF;
197 if (i == prefixBlockSize - 1) {
198 int offset = insertPos % 4;
199 if (offset > 0) {
200 int shiftSize = 8 - offset * 2;
201 mask >>>= shiftSize;
202 mask <<= shiftSize;
203 }
204 }
205
206 new2bitArray[i] = (byte) (sequence2bit[i] & mask);
207 }
208
209
210 for (int i = 0; i < acgt.length(); i++) {
211 final int blockPos = (insertPos + i) / 4;
212 final int blockOffset = (insertPos + i) % 4;
213 setACGT(new2bitArray, insertPos + i, acgt.charAt(i));
214 }
215
216
217 for (int i = insertPos; i < size; ++i) {
218 setACGT(new2bitArray, i + insertLen, charAt(i));
219 }
220
221 this.sequence2bit = new2bitArray;
222 this.size = newSize;
223 return this;
224 }
225
226 private static void setACGT(byte[] array, int index, char acgt) {
227 int code = ACGTEncoder.to2bitCode(acgt) & 0x03;
228 int shiftSize = 6 - (index % 4) * 2;
229
230 final int blockIndex = index / 4;
231
232 array[blockIndex] &= ~(0x03 << shiftSize);
233 array[blockIndex] |= (byte) (code << shiftSize);
234 }
235
236 private static void setACGTCode(byte[] array, int index, int code) {
237 array[index / 4] |= (byte) ((code & 0x03) << (6 - (index % 4) * 2));
238 }
239
240 public Kmer reverseComplement() {
241 byte[] rc = new byte[sequence2bit.length];
242
243 int rIndex = 0;
244 for (int i = size - 1; i >= 0; --i, rIndex++) {
245 final int offset = i % 4;
246 int code = (sequence2bit[i / 4] >>> (6 - offset * 2)) & 0x03;
247 setACGTCode(rc, rIndex, ~code);
248 }
249
250 return new Kmer(rc, size);
251 }
252
253 @Override
254 public String toString() {
255 StringBuilder s = new StringBuilder();
256 for (int i = 0; i < size; i++) {
257 s.append(this.charAt(i));
258 }
259 return s.toString();
260
261 }
262
263 @Override
264 public boolean equals(Object obj) {
265 if (!Kmer.class.isInstance(obj))
266 return false;
267
268 Kmer other = Kmer.class.cast(obj);
269
270 if (this.size != other.size)
271 return false;
272
273 for (int i = 0; i < this.sequence2bit.length; ++i) {
274 if (this.sequence2bit[i] != other.sequence2bit[i])
275 return false;
276 }
277
278 return true;
279 }
280
281 @Override
282 public int hashCode() {
283
284 int h = 3;
285 for (int i = 0; i < this.sequence2bit.length; ++i)
286 h += sequence2bit[i] * 1997;
287
288 return h;
289 }
290
291 }