View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2010 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // FastqToBAM.java
20  // Since: Jul 5, 2010
21  //
22  //--------------------------------------
23  package org.utgenome.format.fastq;
24  
25  import java.io.BufferedReader;
26  import java.io.File;
27  import java.io.FileInputStream;
28  import java.io.FileReader;
29  import java.io.IOException;
30  import java.io.InputStreamReader;
31  import java.io.Reader;
32  import java.util.zip.GZIPInputStream;
33  
34  import net.sf.samtools.SAMFileHeader;
35  import net.sf.samtools.SAMFileWriter;
36  import net.sf.samtools.SAMFileWriterFactory;
37  import net.sf.samtools.SAMReadGroupRecord;
38  import net.sf.samtools.SAMRecord;
39  
40  import org.utgenome.UTGBErrorCode;
41  import org.utgenome.UTGBException;
42  import org.xerial.util.log.Logger;
43  import org.xerial.util.opt.Argument;
44  import org.xerial.util.opt.Option;
45  import org.xerial.util.opt.OptionParser;
46  import org.xerial.util.opt.OptionParserException;
47  
48  /**
49   * Converting Illumina's FASTQ read data (single or paired-end) into BAM format, which can be used BLOAD's GATK
50   * pipeline.
51   * 
52   * This code is migrated from
53   * 
54   * @author leo
55   * 
56   */
57  public class FastqToBAM {
58  
59  	private static Logger _logger = Logger.getLogger(FastqToBAM.class);
60  
61  	@Option(longName = "readGroup", description = "read group name")
62  	private String readGroupName;
63  
64  	@Option(longName = "sample", description = "sample name")
65  	private String sampleName;
66  
67  	@Option(longName = "prefix", description = "prefix of the read")
68  	private String readPrefix;
69  
70  	@Argument(index = 0, name = "input fastq (.fastq, .fastq.gz)", required = true)
71  	private File input1;
72  	@Argument(index = 1, name = "input fastq (.fastq, .fastq.gz, when paried-end read)")
73  	private File input2;
74  
75  	@Option(symbol = "o", longName = "output", description = "output file name (.sam or .bam)")
76  	private File outputFile;
77  
78  	@Option(longName = "tmpdir", description = "temporary directory.for storing merge sort runs. default = System.getProperty(\"java.io.tmpdir\"));")
79  	private File tmpDir = new File(System.getProperty("java.io.tmpdir"));
80  
81  	public static int execute(String[] args) throws Exception {
82  
83  		FastqToBAM main = new FastqToBAM();
84  		OptionParser parser = new OptionParser(main);
85  		try {
86  			parser.parse(args);
87  
88  			main.convert();
89  		}
90  		catch (OptionParserException e) {
91  			_logger.error(e);
92  			return 1;
93  		}
94  
95  		return 0;
96  
97  	}
98  
99  	public int convert() throws UTGBException, IOException {
100 		if (input1 == null) {
101 			throw new UTGBException(UTGBErrorCode.MISSING_OPTION, "missing fastq file");
102 		}
103 
104 		Reader in1;
105 		if (input1.getName().endsWith(".gz")) {
106 			in1 = new InputStreamReader(new GZIPInputStream(new FileInputStream(input1)));
107 		}
108 		else
109 			in1 = new FileReader(input1);
110 
111 		Reader in2 = null;
112 		if (input2 != null) {
113 			if (input2.getName().endsWith(".gz"))
114 				in2 = new InputStreamReader(new GZIPInputStream(new FileInputStream(input2)));
115 			else
116 				in2 = new FileReader(input2);
117 		}
118 
119 		return convert(new BufferedReader(in1), in2 == null ? null : new BufferedReader(in2));
120 	}
121 
122 	public int convert(Reader input1, Reader input2) throws UTGBException, IOException {
123 
124 		if (!tmpDir.exists()) {
125 			_logger.info("create dir: " + tmpDir.getPath());
126 			tmpDir.mkdirs();
127 		}
128 		// set the temporary directory for storing merge sort runs. 
129 		// Picard library will look up this java.io.tmpdir system property value. 
130 		System.setProperty("java.io.tmpdir", tmpDir.getAbsolutePath());
131 
132 		FastqReader end1 = new FastqReader(input1);
133 		FastqReader end2 = (input2 == null) ? null : new FastqReader(input2);
134 
135 		SAMReadGroupRecord readGroupRecord = new SAMReadGroupRecord(readGroupName);
136 		SAMFileHeader samHeader = new SAMFileHeader();
137 		if (readGroupName != null) {
138 			samHeader.addReadGroup(readGroupRecord);
139 		}
140 		readGroupRecord.setSample(sampleName);
141 		samHeader.setSortOrder(SAMFileHeader.SortOrder.queryname);
142 
143 		if (outputFile == null)
144 			throw new UTGBException(UTGBErrorCode.MISSING_OPTION, "no output file is specified by -o option");
145 
146 		SAMFileWriter sfw = (new SAMFileWriterFactory()).makeSAMOrBAMWriter(samHeader, false, outputFile);
147 		int readsSeen = 0;
148 		FastqRead fqr1 = null, fqr2 = null;
149 		try {
150 			for (; (fqr1 = end1.next()) != null && (end2 == null || (fqr2 = end2.next()) != null);) {
151 
152 				String fqr1Name = fqr1.seqname;
153 
154 				SAMRecord sr1 = new SAMRecord(samHeader);
155 				sr1.setReadName(readPrefix != null ? (readPrefix + ":" + fqr1Name) : fqr1Name);
156 				sr1.setReadString(fqr1.seq);
157 				sr1.setBaseQualityString(fqr1.qual);
158 				sr1.setReadUnmappedFlag(true);
159 				sr1.setReadPairedFlag(false);
160 				sr1.setAttribute("RG", readGroupName);
161 
162 				SAMRecord sr2 = null;
163 
164 				// paired-end read
165 				if (fqr2 != null) {
166 					sr1.setReadPairedFlag(true);
167 					sr1.setFirstOfPairFlag(true);
168 					sr1.setSecondOfPairFlag(false);
169 					sr1.setMateUnmappedFlag(true);
170 
171 					String fqr2Name = fqr2.seqname;
172 					sr2 = new SAMRecord(samHeader);
173 					sr2.setReadName(readPrefix != null ? (readPrefix + ":" + fqr2Name) : fqr2Name);
174 					sr2.setReadString(fqr2.seq);
175 					sr2.setBaseQualityString(fqr2.qual);
176 					sr2.setReadUnmappedFlag(true);
177 					sr2.setReadPairedFlag(true);
178 					sr2.setAttribute("RG", readGroupName);
179 					sr2.setFirstOfPairFlag(false);
180 					sr2.setSecondOfPairFlag(true);
181 					sr2.setMateUnmappedFlag(true);
182 				}
183 
184 				sfw.addAlignment(sr1);
185 				if (fqr2 != null) {
186 					sfw.addAlignment(sr2);
187 				}
188 				readsSeen++;
189 
190 				if (readsSeen > 0 && (readsSeen % 1000000) == 0) {
191 					_logger.info(String.format("%d (paired) reads has been processed.", readsSeen));
192 				}
193 			}
194 		}
195 		catch (Exception e) {
196 			_logger.error(String.format("error found when reading %d-th read: %s\n%s%s", readsSeen, e.getMessage(), fqr1 != null ? "fq1:\n" + fqr1.toSilk()
197 					: "", fqr2 != null ? "fq2:\n" + fqr2.toSilk() : ""));
198 			throw UTGBException.convert(e);
199 		}
200 		finally {
201 			if (end1 != null)
202 				end1.close();
203 			if (end2 != null)
204 				end2.close();
205 
206 			if (sfw != null)
207 				sfw.close();
208 		}
209 
210 		return readsSeen;
211 	}
212 }