1 /*-------------------------------------------------------------------------- 2 * Copyright 2010 utgenome.org 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 *--------------------------------------------------------------------------*/ 16 //-------------------------------------- 17 // utgb-core Project 18 // 19 // FastqReader.java 20 // Since: Jul 2, 2010 21 // 22 //-------------------------------------- 23 package org.utgenome.format.fastq; 24 25 import java.io.BufferedReader; 26 import java.io.IOException; 27 import java.io.Reader; 28 29 import org.utgenome.UTGBException; 30 31 /** 32 * FASTQ format file reader. 33 * 34 * 35 * <h1>FASTQ Format Specification: http://maq.sourceforge.net/fastq.shtml</h1> 36 * 37 * <h2>Introduction</h2> 38 * 39 * <p> 40 * FASTQ format stores sequences and Phred qualities in a single file. It is concise and compact. FASTQ is first widely 41 * used in the Sanger Institute and therefore we usually take the Sanger specification and the standard FASTQ format, or 42 * simply FASTQ format. Although Solexa/Illumina read file looks pretty much like FASTQ, they are different in that the 43 * qualities are scaled differently. In the quality string, if you can see a character with its ASCII code higher than 44 * 90, probably your file is in the Solexa/Illumina format. 45 * </p> 46 * 47 * <pre> 48 * Example 49 * 50 * @EAS54_6_R1_2_1_413_324 51 * CCCTTCTTGTCTTCAGCGTTTCTCC 52 * + 53 * ;;3;;;;;;;;;;;;7;;;;;;;88 54 * @EAS54_6_R1_2_1_540_792 55 * TTGGCAGGCCAAGGCCGATGGATCA 56 * + 57 * ;;;;;;;;;;;7;;;;;-;;;3;83 58 * @EAS54_6_R1_2_1_443_348 59 * GTTGCTTCTGGCGTGGGTGGGGGGG 60 * +EAS54_6_R1_2_1_443_348 61 * ;;;;;;;;;;;9;7;;.7;393333 62 * </pre> 63 * 64 * <h2>FASTQ Format Specification</h2> 65 * 66 * <h3>Notations</h3> 67 * 68 * <pre> 69 * <fastq>, <blocks> and so on represents non-terminal symbols. 70 * Characters in red are regex-like operators. 71 * '\n' stands for the Return key. 72 * Syntax 73 * 74 * <fastq> := <block>+ 75 * <block> := @<seqname>\n<seq>\n"+"[<seqname>]?\n<qual>\n 76 * <seqname> := [A-Za-z0-9_.:-]+ 77 * <seq> := [A-Za-z\n\.~]+ 78 * <qual> := [!-~\n]+ 79 * </pre> 80 * 81 * <h2>Requirements</h2> 82 * 83 * The <seqname> following '+' is optional, but if it appears right after '+', it should be identical to the <seqname> 84 * following '@'. The length of <seq> is identical the length of <qual>. Each character in <qual> represents the phred 85 * quality of the corresponding nucleotide in <seq>. If the Phred quality is $Q, which is a non-negative integer, the 86 * corresponding quality character can be calculated with the following Perl code: $q = chr(($Q<=93? $Q : 93) + 33); 87 * where chr() is the Perl function to convert an integer to a character based on the ASCII table. Conversely, given a 88 * character $q, the corresponding Phred quality can be calculated with: $Q = ord($q) - 33; where ord() gives the ASCII 89 * code of a character. Solexa/Illumina Read Format 90 * 91 * The syntax of Solexa/Illumina read format is almost identical to the FASTQ format, but the qualities are scaled 92 * differently. Given a character $sq, the following Perl code gives the Phred quality $Q: 93 * 94 * $Q = 10 * log(1 + 10 ** (ord($sq) - 64) / 10.0)) / log(10); 95 * 96 * @author leo 97 * 98 */ 99 public class FastqReader { 100 101 BufferedReader reader; 102 103 public FastqReader(Reader input) { 104 if (!BufferedReader.class.isInstance(input)) 105 this.reader = new BufferedReader(input); 106 else 107 this.reader = BufferedReader.class.cast(input); 108 } 109 110 /** 111 * Read the next FASTQ read entry. 112 * 113 * @return FastqRead entry or null if the end of stream has reached 114 * @throws UTGBException 115 */ 116 public FastqRead next() throws UTGBException { 117 return FastqRead.parse(reader); 118 } 119 120 public void close() throws IOException { 121 reader.close(); 122 } 123 124 }