View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2007 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // UTGB Common Project
18  //
19  // FASTAPullParser.java
20  // Since: Jun 4, 2007
21  //
22  // $URL$ 
23  // $Author$
24  //--------------------------------------
25  package org.utgenome.format.fasta;
26  
27  import java.io.BufferedInputStream;
28  import java.io.BufferedReader;
29  import java.io.File;
30  import java.io.FileInputStream;
31  import java.io.IOException;
32  import java.io.InputStream;
33  import java.io.InputStreamReader;
34  import java.io.Reader;
35  import java.util.LinkedList;
36  import java.util.zip.GZIPInputStream;
37  
38  import org.apache.tools.tar.TarEntry;
39  import org.apache.tools.tar.TarInputStream;
40  import org.utgenome.UTGBErrorCode;
41  import org.utgenome.UTGBException;
42  import org.xerial.util.FileType;
43  import org.xerial.util.log.Logger;
44  
45  /**
46   * A pull parser for FASTA format files
47   * 
48   * @author leo
49   * 
50   */
51  public class FASTAPullParser {
52  	private static Logger _logger = Logger.getLogger(FASTAPullParser.class);
53  
54  	private static enum TokenType {
55  		DescriptionLine, SequenceLine
56  	}
57  
58  	private static class Token {
59  		private TokenType type;
60  		private String data;
61  
62  		public Token(TokenType type, String data) {
63  			this.type = type;
64  			this.data = data.trim();
65  		}
66  
67  		public TokenType getType() {
68  			return type;
69  		}
70  
71  		public String getData() {
72  			return data;
73  		}
74  	}
75  
76  	private LinkedList<Token> tokenStack = new LinkedList<Token>();
77  	private final FASTAReader fastaReader;
78  	private int lineCount = 0;
79  
80  	private static int DEFAULT_BUFFER_SIZE = 4 * 1024 * 1024;
81  
82  	/**
83  	 * Create a pull parser from a given text reader
84  	 * 
85  	 * @param reader
86  	 */
87  	public FASTAPullParser(Reader reader) {
88  		fastaReader = new DefaultFASTAReader(new BufferedReader(reader));
89  	}
90  
91  	public static FASTAPullParser newTARFileReader(InputStream tarFileInput) throws IOException {
92  		return new FASTAPullParser(new TarFASTAReader(new BufferedInputStream(tarFileInput, DEFAULT_BUFFER_SIZE), DEFAULT_BUFFER_SIZE));
93  	}
94  
95  	public static FASTAPullParser newTARGZFileReader(InputStream targzInput) throws IOException {
96  		return new FASTAPullParser(new TarFASTAReader(new GZIPInputStream(new BufferedInputStream(targzInput)), DEFAULT_BUFFER_SIZE));
97  	}
98  
99  	public static FASTAPullParser newGZIPFileReader(InputStream gzInput) throws IOException {
100 		return new FASTAPullParser(new DefaultFASTAReader(new GZIPInputStream(new BufferedInputStream(gzInput)), DEFAULT_BUFFER_SIZE));
101 	}
102 
103 	private FASTAPullParser(FASTAReader reader) {
104 		this.fastaReader = reader;
105 	}
106 
107 	/**
108 	 * Create a pull parse for the given FASTA file. The file can be text (.fa, .fasta, etc.), tar.gz or gz format.
109 	 * 
110 	 * @param fastaFile
111 	 * @throws IOException
112 	 */
113 	public FASTAPullParser(File fastaFile) throws IOException {
114 		this(fastaFile.getName(), new FileInputStream(fastaFile), DEFAULT_BUFFER_SIZE);
115 	}
116 
117 	public FASTAPullParser(File fastaFile, int bufferSize) throws IOException {
118 		this(fastaFile.getName(), new FileInputStream(fastaFile), bufferSize);
119 	}
120 
121 	protected FASTAPullParser(String fastaFile, InputStream in, int bufferSize) throws IOException {
122 		final int BUFFER_SIZE = bufferSize;
123 
124 		FileType fileType = FileType.getFileType(fastaFile);
125 		switch (fileType) {
126 		case TAR:
127 			fastaReader = new TarFASTAReader(new BufferedInputStream(in, BUFFER_SIZE), BUFFER_SIZE);
128 			break;
129 		case TAR_GZ:
130 			fastaReader = new TarFASTAReader(new GZIPInputStream(new BufferedInputStream(in)), BUFFER_SIZE);
131 			break;
132 		case GZIP:
133 			fastaReader = new DefaultFASTAReader(new GZIPInputStream(new BufferedInputStream(in)), BUFFER_SIZE);
134 			break;
135 		case FASTA:
136 		default:
137 			fastaReader = new DefaultFASTAReader(in, BUFFER_SIZE);
138 			break;
139 		}
140 	}
141 
142 	private static interface FASTAReader {
143 		public String nextLine() throws IOException;
144 
145 		public void close() throws IOException;
146 	}
147 
148 	private static class DefaultFASTAReader implements FASTAReader {
149 
150 		private BufferedReader in;
151 
152 		public DefaultFASTAReader(BufferedReader r) {
153 			this.in = r;
154 		}
155 
156 		public DefaultFASTAReader(InputStream in, int bufferSize) {
157 			this.in = new BufferedReader(new InputStreamReader(in), bufferSize);
158 		}
159 
160 		public String nextLine() throws IOException {
161 			return in.readLine();
162 		}
163 
164 		public void close() throws IOException {
165 			if (in != null)
166 				in.close();
167 		}
168 	}
169 
170 	private static class TarFASTAReader implements FASTAReader {
171 
172 		TarInputStream tarIn;
173 		BufferedReader reader = null;
174 		int bufferSize;
175 
176 		public TarFASTAReader(InputStream in, int bufferSize) throws IOException {
177 			this.tarIn = new TarInputStream(in);
178 			this.bufferSize = bufferSize;
179 		}
180 
181 		public String nextLine() throws IOException {
182 
183 			if (reader != null) {
184 				String line = reader.readLine();
185 				if (line == null) {
186 					reader = null;
187 					return nextLine();
188 				}
189 				else
190 					return line;
191 			}
192 
193 			while (true) {
194 				TarEntry currentEntry = tarIn.getNextEntry();
195 				if (currentEntry == null)
196 					return null;
197 
198 				if (currentEntry.isDirectory()) {
199 					continue;
200 				}
201 
202 				FileType fileType = FileType.getFileType(currentEntry.getName());
203 				if (fileType != FileType.FASTA)
204 					continue;
205 
206 				reader = new BufferedReader(new InputStreamReader(tarIn), bufferSize);
207 				break;
208 			}
209 
210 			return nextLine();
211 		}
212 
213 		public void close() throws IOException {
214 			if (reader != null)
215 				reader.close();
216 		}
217 
218 	}
219 
220 	private boolean hasStackedToken() {
221 		return !tokenStack.isEmpty();
222 	}
223 
224 	private Token popToken() {
225 		return tokenStack.removeLast();
226 	}
227 
228 	private Token nextToken() throws IOException {
229 		if (hasStackedToken())
230 			return popToken();
231 		else {
232 			// read next line
233 			String line = fastaReader.nextLine();
234 			if (line == null)
235 				return null; // no more token
236 			lineCount++;
237 			if (line.startsWith(">"))
238 				return new Token(TokenType.DescriptionLine, line.substring(1));
239 			else
240 				return new Token(TokenType.SequenceLine, line);
241 		}
242 	}
243 
244 	/**
245 	 * read the next fasta sequence;
246 	 * 
247 	 * @return the next fasta sequence, or null when there is no more sequence to read.
248 	 * @throws InvalidFormatException
249 	 *             when the input fasta data format is invalid
250 	 * @throws IOException
251 	 */
252 	public FASTASequence nextSequence() throws UTGBException, IOException {
253 		Token t = nextToken();
254 		if (t == null)
255 			return null;
256 
257 		TokenType type = t.getType();
258 		if (type == TokenType.DescriptionLine) {
259 			String seq = readSequence();
260 			return new FASTASequence(t.getData(), seq);
261 		}
262 		else
263 			return null;
264 	}
265 
266 	public String nextSequenceLine() throws IOException {
267 		Token t = nextToken();
268 		if (t == null)
269 			return null;
270 
271 		TokenType type = t.getType();
272 		if (type == TokenType.SequenceLine) {
273 			return t.getData();
274 		}
275 		else {
276 			tokenStack.add(t);
277 			return null;
278 		}
279 	}
280 
281 	public String nextDescriptionLine() throws IOException {
282 		Token t = nextToken();
283 		if (t == null)
284 			return null;
285 		if (t.getType() == TokenType.DescriptionLine) {
286 			return t.getData();
287 		}
288 		else {
289 			tokenStack.add(t);
290 			return null;
291 		}
292 	}
293 
294 	private String readSequence() throws UTGBException, IOException {
295 		Token t = nextToken();
296 		if (t == null)
297 			throw new UTGBException(UTGBErrorCode.INVALID_FORMAT, "sequence is null: " + lineInfo());
298 		TokenType type;
299 		StringBuilder builder = new StringBuilder();
300 		while ((type = t.getType()) == TokenType.SequenceLine) {
301 			builder.append(t.getData());
302 			t = nextToken();
303 			if (t == null) {
304 				return builder.toString();
305 			}
306 		}
307 		tokenStack.add(t);
308 		return builder.toString();
309 
310 	}
311 
312 	private String lineInfo() {
313 		return "line=" + lineCount;
314 	}
315 
316 	public void close() throws IOException {
317 		if (fastaReader != null)
318 			fastaReader.close();
319 	}
320 
321 }