View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2010 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // FastqReader.java
20  // Since: Jul 2, 2010
21  //
22  //--------------------------------------
23  package org.utgenome.format.fastq;
24  
25  import java.io.BufferedReader;
26  import java.io.IOException;
27  import java.io.Reader;
28  
29  import org.utgenome.UTGBException;
30  
31  /**
32   * FASTQ format file reader.
33   * 
34   * 
35   * <h1>FASTQ Format Specification: http://maq.sourceforge.net/fastq.shtml</h1>
36   * 
37   * <h2>Introduction</h2>
38   * 
39   * <p>
40   * FASTQ format stores sequences and Phred qualities in a single file. It is concise and compact. FASTQ was first widely
41   * used at the Sanger Institute, and therefore we usually take the Sanger specification as the standard FASTQ format, or
42   * simply FASTQ format. Although a Solexa/Illumina read file looks pretty much like FASTQ, the two differ in that the
43   * qualities are scaled differently. If the quality string contains a character whose ASCII code is higher than 90,
44   * your file is probably in the Solexa/Illumina format.
45   * </p>
46   * 
47   * <pre>
48   * Example
49   * 
50   * @EAS54_6_R1_2_1_413_324
51   * CCCTTCTTGTCTTCAGCGTTTCTCC
52   * +
53   * ;;3;;;;;;;;;;;;7;;;;;;;88
54   * @EAS54_6_R1_2_1_540_792
55   * TTGGCAGGCCAAGGCCGATGGATCA
56   * +
57   * ;;;;;;;;;;;7;;;;;-;;;3;83
58   * @EAS54_6_R1_2_1_443_348
59   * GTTGCTTCTGGCGTGGGTGGGGGGG
60   * +EAS54_6_R1_2_1_443_348
61   * ;;;;;;;;;;;9;7;;.7;393333
62   * </pre>
63   * 
64   * <h2>FASTQ Format Specification</h2>
65   * 
66   * <h3>Notations</h3>
67   * 
68   * <pre>
69   * <fastq>, <blocks> and so on represents non-terminal symbols.
70   * Characters in red are regex-like operators.
71   * '\n' stands for the Return key.
72   * Syntax
73   * 
74   * <fastq>	:=	<block>+
75   * <block>	:=	@<seqname>\n<seq>\n"+"[<seqname>]?\n<qual>\n
76   * <seqname>	:=	[A-Za-z0-9_.:-]+
77   * <seq>	:=	[A-Za-z\n\.~]+
78   * <qual>	:=	[!-~\n]+
79   * </pre>
80   * 
81   * <h2>Requirements</h2>
82   * 
83   * The <seqname> following '+' is optional, but if it appears right after '+', it should be identical to the <seqname>
84   * following '@'. The length of <seq> is identical to the length of <qual>. Each character in <qual> represents the Phred
85   * quality of the corresponding nucleotide in <seq>. If the Phred quality is $Q, which is a non-negative integer, the
86   * corresponding quality character can be calculated with the following Perl code: $q = chr(($Q<=93? $Q : 93) + 33);
87   * where chr() is the Perl function to convert an integer to a character based on the ASCII table. Conversely, given a
88   * character $q, the corresponding Phred quality can be calculated with: $Q = ord($q) - 33; where ord() gives the ASCII
89   * code of a character.
90   * <h2>Solexa/Illumina Read Format</h2>
91   * The syntax of Solexa/Illumina read format is almost identical to the FASTQ format, but the qualities are scaled
92   * differently. Given a character $sq, the following Perl code gives the Phred quality $Q:
93   * 
94   * $Q = 10 * log(1 + 10 ** ((ord($sq) - 64) / 10.0)) / log(10);
95   * 
96   * @author leo
97   * 
98   */
99  public class FastqReader {
100 
101 	BufferedReader reader;
102 
103 	public FastqReader(Reader input) {
104 		if (!BufferedReader.class.isInstance(input))
105 			this.reader = new BufferedReader(input);
106 		else
107 			this.reader = BufferedReader.class.cast(input);
108 	}
109 
110 	/**
111 	 * Read the next FASTQ read entry.
112 	 * 
113 	 * @return FastqRead entry or null if the end of stream has reached
114 	 * @throws UTGBException
115 	 */
116 	public FastqRead next() throws UTGBException {
117 		return FastqRead.parse(reader);
118 	}
119 
120 	public void close() throws IOException {
121 		reader.close();
122 	}
123 
124 }