1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.utgenome.format.fastq;
24
25 import java.io.BufferedReader;
26 import java.io.File;
27 import java.io.FileInputStream;
28 import java.io.FileReader;
29 import java.io.IOException;
30 import java.io.InputStreamReader;
31 import java.io.Reader;
32 import java.util.zip.GZIPInputStream;
33
34 import net.sf.samtools.SAMFileHeader;
35 import net.sf.samtools.SAMFileWriter;
36 import net.sf.samtools.SAMFileWriterFactory;
37 import net.sf.samtools.SAMReadGroupRecord;
38 import net.sf.samtools.SAMRecord;
39
40 import org.utgenome.UTGBErrorCode;
41 import org.utgenome.UTGBException;
42 import org.xerial.util.log.Logger;
43 import org.xerial.util.opt.Argument;
44 import org.xerial.util.opt.Option;
45 import org.xerial.util.opt.OptionParser;
46 import org.xerial.util.opt.OptionParserException;
47
48
49
50
51
52
53
54
55
56
57 public class FastqToBAM {
58
59 private static Logger _logger = Logger.getLogger(FastqToBAM.class);
60
61 @Option(longName = "readGroup", description = "read group name")
62 private String readGroupName;
63
64 @Option(longName = "sample", description = "sample name")
65 private String sampleName;
66
67 @Option(longName = "prefix", description = "prefix of the read")
68 private String readPrefix;
69
70 @Argument(index = 0, name = "input fastq (.fastq, .fastq.gz)", required = true)
71 private File input1;
72 @Argument(index = 1, name = "input fastq (.fastq, .fastq.gz, when paried-end read)")
73 private File input2;
74
75 @Option(symbol = "o", longName = "output", description = "output file name (.sam or .bam)")
76 private File outputFile;
77
78 @Option(longName = "tmpdir", description = "temporary directory.for storing merge sort runs. default = System.getProperty(\"java.io.tmpdir\"));")
79 private File tmpDir = new File(System.getProperty("java.io.tmpdir"));
80
81 public static int execute(String[] args) throws Exception {
82
83 FastqToBAM main = new FastqToBAM();
84 OptionParser parser = new OptionParser(main);
85 try {
86 parser.parse(args);
87
88 main.convert();
89 }
90 catch (OptionParserException e) {
91 _logger.error(e);
92 return 1;
93 }
94
95 return 0;
96
97 }
98
99 public int convert() throws UTGBException, IOException {
100 if (input1 == null) {
101 throw new UTGBException(UTGBErrorCode.MISSING_OPTION, "missing fastq file");
102 }
103
104 Reader in1;
105 if (input1.getName().endsWith(".gz")) {
106 in1 = new InputStreamReader(new GZIPInputStream(new FileInputStream(input1)));
107 }
108 else
109 in1 = new FileReader(input1);
110
111 Reader in2 = null;
112 if (input2 != null) {
113 if (input2.getName().endsWith(".gz"))
114 in2 = new InputStreamReader(new GZIPInputStream(new FileInputStream(input2)));
115 else
116 in2 = new FileReader(input2);
117 }
118
119 return convert(new BufferedReader(in1), in2 == null ? null : new BufferedReader(in2));
120 }
121
122 public int convert(Reader input1, Reader input2) throws UTGBException, IOException {
123
124 if (!tmpDir.exists()) {
125 _logger.info("create dir: " + tmpDir.getPath());
126 tmpDir.mkdirs();
127 }
128
129
130 System.setProperty("java.io.tmpdir", tmpDir.getAbsolutePath());
131
132 FastqReader end1 = new FastqReader(input1);
133 FastqReader end2 = (input2 == null) ? null : new FastqReader(input2);
134
135 SAMReadGroupRecord readGroupRecord = new SAMReadGroupRecord(readGroupName);
136 SAMFileHeader samHeader = new SAMFileHeader();
137 if (readGroupName != null) {
138 samHeader.addReadGroup(readGroupRecord);
139 }
140 readGroupRecord.setSample(sampleName);
141 samHeader.setSortOrder(SAMFileHeader.SortOrder.queryname);
142
143 if (outputFile == null)
144 throw new UTGBException(UTGBErrorCode.MISSING_OPTION, "no output file is specified by -o option");
145
146 SAMFileWriter sfw = (new SAMFileWriterFactory()).makeSAMOrBAMWriter(samHeader, false, outputFile);
147 int readsSeen = 0;
148 FastqRead fqr1 = null, fqr2 = null;
149 try {
150 for (; (fqr1 = end1.next()) != null && (end2 == null || (fqr2 = end2.next()) != null);) {
151
152 String fqr1Name = fqr1.seqname;
153
154 SAMRecord sr1 = new SAMRecord(samHeader);
155 sr1.setReadName(readPrefix != null ? (readPrefix + ":" + fqr1Name) : fqr1Name);
156 sr1.setReadString(fqr1.seq);
157 sr1.setBaseQualityString(fqr1.qual);
158 sr1.setReadUnmappedFlag(true);
159 sr1.setReadPairedFlag(false);
160 sr1.setAttribute("RG", readGroupName);
161
162 SAMRecord sr2 = null;
163
164
165 if (fqr2 != null) {
166 sr1.setReadPairedFlag(true);
167 sr1.setFirstOfPairFlag(true);
168 sr1.setSecondOfPairFlag(false);
169 sr1.setMateUnmappedFlag(true);
170
171 String fqr2Name = fqr2.seqname;
172 sr2 = new SAMRecord(samHeader);
173 sr2.setReadName(readPrefix != null ? (readPrefix + ":" + fqr2Name) : fqr2Name);
174 sr2.setReadString(fqr2.seq);
175 sr2.setBaseQualityString(fqr2.qual);
176 sr2.setReadUnmappedFlag(true);
177 sr2.setReadPairedFlag(true);
178 sr2.setAttribute("RG", readGroupName);
179 sr2.setFirstOfPairFlag(false);
180 sr2.setSecondOfPairFlag(true);
181 sr2.setMateUnmappedFlag(true);
182 }
183
184 sfw.addAlignment(sr1);
185 if (fqr2 != null) {
186 sfw.addAlignment(sr2);
187 }
188 readsSeen++;
189
190 if (readsSeen > 0 && (readsSeen % 1000000) == 0) {
191 _logger.info(String.format("%d (paired) reads has been processed.", readsSeen));
192 }
193 }
194 }
195 catch (Exception e) {
196 _logger.error(String.format("error found when reading %d-th read: %s\n%s%s", readsSeen, e.getMessage(), fqr1 != null ? "fq1:\n" + fqr1.toSilk()
197 : "", fqr2 != null ? "fq2:\n" + fqr2.toSilk() : ""));
198 throw UTGBException.convert(e);
199 }
200 finally {
201 if (end1 != null)
202 end1.close();
203 if (end2 != null)
204 end2.close();
205
206 if (sfw != null)
207 sfw.close();
208 }
209
210 return readsSeen;
211 }
212 }