1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.utgenome.format.fasta;
26
27 import java.io.BufferedInputStream;
28 import java.io.BufferedReader;
29 import java.io.File;
30 import java.io.FileInputStream;
31 import java.io.IOException;
32 import java.io.InputStream;
33 import java.io.InputStreamReader;
34 import java.io.Reader;
35 import java.util.LinkedList;
36 import java.util.zip.GZIPInputStream;
37
38 import org.apache.tools.tar.TarEntry;
39 import org.apache.tools.tar.TarInputStream;
40 import org.utgenome.UTGBErrorCode;
41 import org.utgenome.UTGBException;
42 import org.xerial.util.FileType;
43 import org.xerial.util.log.Logger;
44
45
46
47
48
49
50
51 public class FASTAPullParser {
52 private static Logger _logger = Logger.getLogger(FASTAPullParser.class);
53
54 private static enum TokenType {
55 DescriptionLine, SequenceLine
56 }
57
58 private static class Token {
59 private TokenType type;
60 private String data;
61
62 public Token(TokenType type, String data) {
63 this.type = type;
64 this.data = data.trim();
65 }
66
67 public TokenType getType() {
68 return type;
69 }
70
71 public String getData() {
72 return data;
73 }
74 }
75
76 private LinkedList<Token> tokenStack = new LinkedList<Token>();
77 private final FASTAReader fastaReader;
78 private int lineCount = 0;
79
80 private static int DEFAULT_BUFFER_SIZE = 4 * 1024 * 1024;
81
82
83
84
85
86
87 public FASTAPullParser(Reader reader) {
88 fastaReader = new DefaultFASTAReader(new BufferedReader(reader));
89 }
90
91 public static FASTAPullParser newTARFileReader(InputStream tarFileInput) throws IOException {
92 return new FASTAPullParser(new TarFASTAReader(new BufferedInputStream(tarFileInput, DEFAULT_BUFFER_SIZE), DEFAULT_BUFFER_SIZE));
93 }
94
95 public static FASTAPullParser newTARGZFileReader(InputStream targzInput) throws IOException {
96 return new FASTAPullParser(new TarFASTAReader(new GZIPInputStream(new BufferedInputStream(targzInput)), DEFAULT_BUFFER_SIZE));
97 }
98
99 public static FASTAPullParser newGZIPFileReader(InputStream gzInput) throws IOException {
100 return new FASTAPullParser(new DefaultFASTAReader(new GZIPInputStream(new BufferedInputStream(gzInput)), DEFAULT_BUFFER_SIZE));
101 }
102
103 private FASTAPullParser(FASTAReader reader) {
104 this.fastaReader = reader;
105 }
106
107
108
109
110
111
112
113 public FASTAPullParser(File fastaFile) throws IOException {
114 this(fastaFile.getName(), new FileInputStream(fastaFile), DEFAULT_BUFFER_SIZE);
115 }
116
117 public FASTAPullParser(File fastaFile, int bufferSize) throws IOException {
118 this(fastaFile.getName(), new FileInputStream(fastaFile), bufferSize);
119 }
120
121 protected FASTAPullParser(String fastaFile, InputStream in, int bufferSize) throws IOException {
122 final int BUFFER_SIZE = bufferSize;
123
124 FileType fileType = FileType.getFileType(fastaFile);
125 switch (fileType) {
126 case TAR:
127 fastaReader = new TarFASTAReader(new BufferedInputStream(in, BUFFER_SIZE), BUFFER_SIZE);
128 break;
129 case TAR_GZ:
130 fastaReader = new TarFASTAReader(new GZIPInputStream(new BufferedInputStream(in)), BUFFER_SIZE);
131 break;
132 case GZIP:
133 fastaReader = new DefaultFASTAReader(new GZIPInputStream(new BufferedInputStream(in)), BUFFER_SIZE);
134 break;
135 case FASTA:
136 default:
137 fastaReader = new DefaultFASTAReader(in, BUFFER_SIZE);
138 break;
139 }
140 }
141
142 private static interface FASTAReader {
143 public String nextLine() throws IOException;
144
145 public void close() throws IOException;
146 }
147
148 private static class DefaultFASTAReader implements FASTAReader {
149
150 private BufferedReader in;
151
152 public DefaultFASTAReader(BufferedReader r) {
153 this.in = r;
154 }
155
156 public DefaultFASTAReader(InputStream in, int bufferSize) {
157 this.in = new BufferedReader(new InputStreamReader(in), bufferSize);
158 }
159
160 public String nextLine() throws IOException {
161 return in.readLine();
162 }
163
164 public void close() throws IOException {
165 if (in != null)
166 in.close();
167 }
168 }
169
170 private static class TarFASTAReader implements FASTAReader {
171
172 TarInputStream tarIn;
173 BufferedReader reader = null;
174 int bufferSize;
175
176 public TarFASTAReader(InputStream in, int bufferSize) throws IOException {
177 this.tarIn = new TarInputStream(in);
178 this.bufferSize = bufferSize;
179 }
180
181 public String nextLine() throws IOException {
182
183 if (reader != null) {
184 String line = reader.readLine();
185 if (line == null) {
186 reader = null;
187 return nextLine();
188 }
189 else
190 return line;
191 }
192
193 while (true) {
194 TarEntry currentEntry = tarIn.getNextEntry();
195 if (currentEntry == null)
196 return null;
197
198 if (currentEntry.isDirectory()) {
199 continue;
200 }
201
202 FileType fileType = FileType.getFileType(currentEntry.getName());
203 if (fileType != FileType.FASTA)
204 continue;
205
206 reader = new BufferedReader(new InputStreamReader(tarIn), bufferSize);
207 break;
208 }
209
210 return nextLine();
211 }
212
213 public void close() throws IOException {
214 if (reader != null)
215 reader.close();
216 }
217
218 }
219
220 private boolean hasStackedToken() {
221 return !tokenStack.isEmpty();
222 }
223
224 private Token popToken() {
225 return tokenStack.removeLast();
226 }
227
228 private Token nextToken() throws IOException {
229 if (hasStackedToken())
230 return popToken();
231 else {
232
233 String line = fastaReader.nextLine();
234 if (line == null)
235 return null;
236 lineCount++;
237 if (line.startsWith(">"))
238 return new Token(TokenType.DescriptionLine, line.substring(1));
239 else
240 return new Token(TokenType.SequenceLine, line);
241 }
242 }
243
244
245
246
247
248
249
250
251
252 public FASTASequence nextSequence() throws UTGBException, IOException {
253 Token t = nextToken();
254 if (t == null)
255 return null;
256
257 TokenType type = t.getType();
258 if (type == TokenType.DescriptionLine) {
259 String seq = readSequence();
260 return new FASTASequence(t.getData(), seq);
261 }
262 else
263 return null;
264 }
265
266 public String nextSequenceLine() throws IOException {
267 Token t = nextToken();
268 if (t == null)
269 return null;
270
271 TokenType type = t.getType();
272 if (type == TokenType.SequenceLine) {
273 return t.getData();
274 }
275 else {
276 tokenStack.add(t);
277 return null;
278 }
279 }
280
281 public String nextDescriptionLine() throws IOException {
282 Token t = nextToken();
283 if (t == null)
284 return null;
285 if (t.getType() == TokenType.DescriptionLine) {
286 return t.getData();
287 }
288 else {
289 tokenStack.add(t);
290 return null;
291 }
292 }
293
294 private String readSequence() throws UTGBException, IOException {
295 Token t = nextToken();
296 if (t == null)
297 throw new UTGBException(UTGBErrorCode.INVALID_FORMAT, "sequence is null: " + lineInfo());
298 TokenType type;
299 StringBuilder builder = new StringBuilder();
300 while ((type = t.getType()) == TokenType.SequenceLine) {
301 builder.append(t.getData());
302 t = nextToken();
303 if (t == null) {
304 return builder.toString();
305 }
306 }
307 tokenStack.add(t);
308 return builder.toString();
309
310 }
311
312 private String lineInfo() {
313 return "line=" + lineCount;
314 }
315
316 public void close() throws IOException {
317 if (fastaReader != null)
318 fastaReader.close();
319 }
320
321 }