1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.utgenome.format.fasta;
26
27 import java.io.ByteArrayOutputStream;
28 import java.io.IOException;
29
30 import org.utgenome.UTGBErrorCode;
31 import org.utgenome.UTGBException;
32 import org.utgenome.gwt.utgb.client.bio.ACGTEncoder;
33 import org.utgenome.util.kmer.KmerIntegerFactory;
34
35
36
37
38
39
40
41 public class CompactACGT implements GenomeSequence {
42
43 final byte[] sequence;
44 final byte[] sequenceMask;
45 private final int length;
46 private final int offset;
47
48 final static int CODE_SIZE = 2;
49 final static int BYTE = 8;
50
51
52
53
54
55
56
57
58
59
60 CompactACGT(byte[] sequence, byte[] sequenceMask, int length, int offset) throws UTGBException {
61 this.sequence = sequence;
62 this.sequenceMask = sequenceMask;
63
64 this.length = length;
65 this.offset = offset;
66
67
68 if ((this.sequence.length * (BYTE / CODE_SIZE)) - offset < length) {
69 throw new UTGBException(UTGBErrorCode.INVALID_INPUT, String.format("packed array is shorter than the specified length: %d (offset=%d) < %d",
70 sequence.length * (BYTE / CODE_SIZE), offset, length));
71 }
72 if ((this.sequenceMask.length * BYTE - offset) < length) {
73 throw new UTGBException(UTGBErrorCode.INVALID_INPUT, String.format("invalid mask binary: buf size=%d (offset=%d), length=%d", sequenceMask.length,
74 offset, length));
75 }
76 }
77
78 class KmerIterator implements OverlappingKmerIterator {
79 private final int K;
80 private final KmerIntegerFactory f;
81 private int start;
82
83 public KmerIterator(int K) {
84 this.K = K;
85 this.f = new KmerIntegerFactory(K);
86 int start = offset;
87 }
88
89 public CompactACGT nextKmer() {
90
91 if (start + K > length)
92 return null;
93
94 try {
95 CompactACGT kmer = new CompactACGT(sequence, sequenceMask, K, CompactACGT.this.offset + start++);
96 return kmer;
97 }
98 catch (UTGBException e) {
99 throw new IllegalStateException(e);
100 }
101 }
102
103 }
104
105
106
107
108
109
110
111
112
113
114 public CompactACGT subSequence(int start, int length) throws UTGBException {
115 return new CompactACGT(sequence, sequenceMask, length, start + this.offset);
116 }
117
118 public CompactACGT reverseComplement() throws UTGBException {
119 ByteArrayOutputStream seqOut = new ByteArrayOutputStream(sequence.length);
120 ByteArrayOutputStream nSeqOut = new ByteArrayOutputStream(sequenceMask.length);
121 try {
122
123 CompactACGTWriter w = new CompactACGTWriter(seqOut, nSeqOut);
124
125 for (int i = length - 1; i >= 0; i--) {
126 int c = code2bitAt(i);
127 c = ~c & 0x03;
128 w.append2bit((byte) c);
129 }
130 w.close();
131 return new CompactACGT(seqOut.toByteArray(), nSeqOut.toByteArray(), length, 0);
132 }
133 catch (IOException e) {
134 throw UTGBException.convert(e);
135 }
136 }
137
138 public OverlappingKmerIterator getKmerIterator(int K) {
139 return new KmerIterator(K);
140 }
141
142 public static CompactACGT createFromString(String seq) throws IOException, UTGBException {
143
144 ByteArrayOutputStream seqOut = new ByteArrayOutputStream();
145 ByteArrayOutputStream nSeqOut = new ByteArrayOutputStream();
146 try {
147 CompactACGTWriter w = new CompactACGTWriter(seqOut, nSeqOut);
148 w.append(seq);
149 w.close();
150 return new CompactACGT(seqOut.toByteArray(), nSeqOut.toByteArray(), seq.length(), 0);
151 }
152 catch (IOException e) {
153 throw UTGBException.convert(e);
154 }
155
156 }
157
158 public int length() {
159 return length;
160 }
161
162 public char charAt(int index) {
163 return ACGTEncoder.toBase(code2bitAt(index));
164 }
165
166
167
168
169
170
171
172 int code2bitAt(int index) {
173 int x = index + offset;
174
175 int maskPos = x / 8;
176 int maskOffset = x % 8;
177 if ((sequenceMask[maskPos] >>> (7 - maskOffset) & 0x01) != 0)
178 return -1;
179
180 int bPos = x / 4;
181 int bOffset = x % 4;
182
183 return (sequence[bPos] >>> (6 - bOffset * 2)) & 0x03;
184 }
185
186 @Override
187 public String toString() {
188 StringBuilder buf = new StringBuilder();
189 for (int i = 0; i < length; i++) {
190 buf.append(charAt(i));
191 }
192 return buf.toString();
193 }
194
195 }