1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.utgenome.format.fasta;
26
27 import java.io.BufferedReader;
28 import java.io.File;
29 import java.io.FileInputStream;
30 import java.io.FileNotFoundException;
31 import java.io.FileReader;
32 import java.io.IOException;
33 import java.io.RandomAccessFile;
34 import java.util.Collections;
35 import java.util.LinkedHashMap;
36 import java.util.Set;
37
38 import org.utgenome.UTGBException;
39 import org.xerial.core.XerialException;
40
41
42
43
44
45
46
47 public class CompactFASTA {
48
49 public final static String PAC_FILE_SUFFIX = ".pac";
50 public final static String PAC_N_FILE_SUFFIX = ".pacn";
51 public final static String PAC_INDEX_FILE_SUFFIX = ".i.silk";
52
53 private final LinkedHashMap<String, CompactFASTAIndex> indexTable = new LinkedHashMap<String, CompactFASTAIndex>();
54
55 private final String fastaFilePrefix;
56 private final PacFileAccess access;
57
58
59
60
61
62
63 public CompactFASTA(String fastaFile) throws XerialException, IOException {
64 this(fastaFile, false);
65 }
66
67 private CompactFASTA(String fastaFile, boolean loadIntoMemory) throws XerialException, IOException {
68 this.fastaFilePrefix = fastaFile;
69
70 File indexFile = new File(fastaFilePrefix + PAC_INDEX_FILE_SUFFIX);
71 for (CompactFASTAIndex each : CompactFASTAIndex.load(new BufferedReader(new FileReader(indexFile)))) {
72 indexTable.put(each.name, each);
73 }
74
75 File pacFile = new File(fastaFilePrefix + PAC_FILE_SUFFIX);
76 File pacNFile = new File(fastaFilePrefix + PAC_N_FILE_SUFFIX);
77 if (!loadIntoMemory)
78 access = new OnDiskAccess(pacFile, pacNFile);
79 else
80 access = new OnMemoryBuffer(pacFile, pacNFile);
81 }
82
83
84
85
86
87
88 public Set<String> getChrSet() {
89 return Collections.unmodifiableSet(indexTable.keySet());
90 }
91
92
93
94
95
96
97
98 public boolean containsChr(String chr) {
99 return indexTable.containsKey(chr);
100 }
101
102 public static CompactFASTA loadIntoMemory(String fastaFilePrefix) throws XerialException, IOException {
103 return new CompactFASTA(fastaFilePrefix, true);
104 }
105
106 public void close() throws IOException {
107 access.close();
108 }
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123 public CompactACGT getSequence(String chr, int start, int end) throws IOException, UTGBException {
124 if (!indexTable.containsKey(chr))
125 return null;
126 CompactFASTAIndex index = indexTable.get(chr);
127 return getSequence(index, start, end);
128 }
129
130 CompactACGT getSequence(CompactFASTAIndex index, int start, int end) throws IOException, UTGBException {
131 if (index == null)
132 throw new IllegalArgumentException("index must not be null");
133
134 if (start > end) {
135 int tmp = end;
136 end = start;
137 start = tmp;
138 }
139
140 int length = end - start;
141 if (length > index.length)
142 length = (index.length - start);
143
144 long bStart = start + index.offset;
145 long bEnd = bStart + length;
146 long pac_lowerBound = bStart / 4;
147 long pac_upperBound = bEnd / 4 + (bEnd % 4 != 0 ? 1 : 0);
148 long pacN_lowerBound = bStart / 8;
149 long pacN_upperBound = bEnd / 8 + (bEnd % 8 != 0 ? 1 : 0);
150
151
152
153 byte[] seqBuf = new byte[(int) (pac_upperBound - pac_lowerBound)];
154 byte[] seqNBuf = new byte[(int) (pacN_upperBound - pacN_lowerBound)];
155
156 access.readSeq(pac_lowerBound, seqBuf);
157 access.readNSeq(pacN_lowerBound, seqNBuf);
158 return new CompactACGT(seqBuf, seqNBuf, length, (int) bStart % 4);
159
160 }
161
162 public GenomeSequence getSequence(String chr, int start) throws IOException, UTGBException {
163 if (!indexTable.containsKey(chr))
164 return null;
165 CompactFASTAIndex index = indexTable.get(chr);
166 return getSequence(index, start, index.length);
167 }
168
169 public GenomeSequence getSequence(String chr) throws IOException, UTGBException {
170 return getSequence(chr, 0);
171 }
172
173 public static String pickSequenceName(String descriptionLine) {
174 int begin = 0;
175 if (descriptionLine.length() > 0 && descriptionLine.charAt(0) == '>')
176 begin++;
177
178
179 for (; begin < descriptionLine.length(); ++begin) {
180 char c = descriptionLine.charAt(begin);
181 if (!(c == ' ' | c == '\t'))
182 break;
183 }
184 int end = begin + 1;
185 for (; end < descriptionLine.length(); ++end) {
186 char c = descriptionLine.charAt(end);
187 if (c == ' ' | c == '\t') {
188 break;
189 }
190 }
191 return descriptionLine.substring(begin, end);
192 }
193
194 public interface PacFileAccess {
195 public void readSeq(long cursor, byte[] buf) throws IOException;
196
197 public void readNSeq(long cursor, byte[] buf) throws IOException;
198
199 public void close() throws IOException;
200 }
201
202 public class OnDiskAccess implements PacFileAccess {
203 private final RandomAccessFile packedFASTA;
204 private final RandomAccessFile packedFASTA_N;
205
206 public OnDiskAccess(File pacFile, File pacNFile) throws FileNotFoundException {
207 packedFASTA = new RandomAccessFile(pacFile, "r");
208 packedFASTA_N = new RandomAccessFile(pacNFile, "r");
209 }
210
211 public void close() throws IOException {
212 if (packedFASTA != null)
213 packedFASTA.close();
214 if (packedFASTA_N != null)
215 packedFASTA_N.close();
216 }
217
218 public void readNSeq(long cursor, byte[] buf) throws IOException {
219 packedFASTA_N.seek(cursor);
220 packedFASTA_N.read(buf);
221 }
222
223 public void readSeq(long cursor, byte[] buf) throws IOException {
224 packedFASTA.seek(cursor);
225 packedFASTA.read(buf);
226 }
227
228 }
229
230 public class OnMemoryBuffer implements PacFileAccess {
231 byte[] pac;
232 byte[] nPac;
233
234 public OnMemoryBuffer(File pacFile, File pacNFile) throws IOException {
235 long pacSize = pacFile.length();
236 long nPacSize = pacNFile.length();
237
238
239 pac = new byte[(int) pacSize];
240 nPac = new byte[(int) nPacSize];
241
242
243 FileInputStream f = new FileInputStream(pacFile);
244 f.read(pac);
245 f.close();
246
247 FileInputStream fn = new FileInputStream(pacNFile);
248 fn.read(nPac);
249 fn.close();
250 }
251
252 public void readSeq(long cursor, byte[] buf) throws IOException {
253 System.arraycopy(pac, (int) cursor, buf, 0, buf.length);
254 }
255
256 public void readNSeq(long cursor, byte[] buf) throws IOException {
257 System.arraycopy(nPac, (int) cursor, buf, 0, buf.length);
258 }
259
260 public void close() throws IOException {
261
262 }
263
264 }
265
266 }