1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.utgenome.format.sam;
26
27 import java.io.File;
28 import java.io.InputStream;
29 import java.util.ArrayList;
30 import java.util.Collections;
31 import java.util.Comparator;
32 import java.util.HashMap;
33 import java.util.Iterator;
34 import java.util.List;
35 import java.util.PriorityQueue;
36
37 import net.sf.samtools.SAMFileReader;
38 import net.sf.samtools.SAMFileReader.ValidationStringency;
39 import net.sf.samtools.SAMRecord;
40 import net.sf.samtools.SAMSequenceDictionary;
41 import net.sf.samtools.SAMSequenceRecord;
42 import net.sf.samtools.util.CloseableIterator;
43
44 import org.utgenome.UTGBException;
45 import org.utgenome.format.wig.WIGDatabaseReader;
46 import org.utgenome.graphics.GenomeWindow;
47 import org.utgenome.gwt.utgb.client.bio.ChrLoc;
48 import org.utgenome.gwt.utgb.client.bio.CompactWIGData;
49 import org.utgenome.gwt.utgb.client.bio.Interval;
50 import org.utgenome.gwt.utgb.client.bio.OnGenome;
51 import org.utgenome.gwt.utgb.client.bio.ReadCoverage;
52 import org.utgenome.gwt.utgb.client.bio.ReadQueryConfig;
53 import org.utgenome.gwt.utgb.client.bio.SAMReadLight;
54 import org.utgenome.gwt.utgb.client.bio.SAMReadPair;
55 import org.utgenome.gwt.utgb.client.bio.SAMReadPairFragment;
56 import org.utgenome.gwt.utgb.client.canvas.IntervalTree;
57 import org.xerial.util.StringUtil;
58 import org.xerial.util.log.Logger;
59
60
61
62
63
64
65
66 public class SAMReader {
67
68 private static Logger _logger = Logger.getLogger(SAMReader.class);
69
70 public static Iterable<SAMRecord> getSAMRecordReader(InputStream samFile) {
71 return new SAMFileReader(samFile);
72 }
73
74 public List<String> getChrList(File samOrBamFile) {
75
76 SAMFileReader sam = new SAMFileReader(samOrBamFile);
77 try {
78 sam.setValidationStringency(ValidationStringency.SILENT);
79
80 SAMSequenceDictionary dict = sam.getFileHeader().getSequenceDictionary();
81 List<String> chrList = new ArrayList<String>();
82 for (SAMSequenceRecord eachSeq : dict.getSequences()) {
83 chrList.add(eachSeq.getSequenceName());
84 }
85 return chrList;
86 }
87 finally {
88 sam.close();
89 }
90 }
91
92 public static File getBamIndexFile(File bamFile) {
93 File baiFile = new File(bamFile.getAbsolutePath() + ".bai");
94 return baiFile;
95 }
96
97 public static interface SAMReadFactory {
98 public SAMReadLight newSAMRead(SAMRecord r);
99 }
100
101 public static class CompleteSAMReadFactory implements SAMReadFactory {
102 public SAMReadLight newSAMRead(SAMRecord r) {
103 return SAM2SilkReader.convertToSAMRead(r);
104 }
105 }
106
107 public static class LightWeightSAMReadFactory implements SAMReadFactory {
108 public SAMReadLight newSAMRead(SAMRecord r) {
109 return SAM2SilkReader.convertToSAMReadLight(r);
110 }
111 }
112
113 private static class ComputeDepth {
114
115 private final GenomeWindow w;
116 private final int pixelWidth;
117
118 public int[] coverage;
119 private final ChrLoc loc;
120 private int startCursor = 0;
121 private int readCount = 0;
122 private IntervalTree<Interval> intervals = new IntervalTree<Interval>();
123
124 private int currentDepth = 0;
125
126 public ComputeDepth(ChrLoc loc, int pixelWidth) {
127 w = new GenomeWindow(loc.start, loc.end);
128 this.pixelWidth = pixelWidth;
129 this.loc = loc;
130 startCursor = loc.start;
131 coverage = new int[pixelWidth];
132 for (int i = 0; i < coverage.length; ++i)
133 coverage[i] = 0;
134 }
135
136 public void computeDepth(List<SAMRecord> loadedReadSet, CloseableIterator<SAMRecord> cursor) {
137 computeDepth(loadedReadSet.iterator());
138 computeDepth(cursor);
139 }
140
141 private PriorityQueue<Integer> boundary = new PriorityQueue<Integer>();
142
143 public void computeDepth(Iterator<SAMRecord> cursor) {
144
145 for (; cursor.hasNext();) {
146 SAMRecord read = cursor.next();
147 if (read.getReadUnmappedFlag())
148 continue;
149
150 readCount++;
151 if (_logger.isDebugEnabled() && readCount > 0 && (readCount % 10000) == 0) {
152 _logger.debug(String.format("reading %s : %d reads", loc, readCount));
153 }
154
155 int start = read.getAlignmentStart();
156 int end = read.getAlignmentEnd();
157 boundary.add(end);
158
159 for (; !boundary.isEmpty();) {
160 int readEnd = boundary.peek();
161 if (readEnd > start)
162 break;
163
164 setDepth(startCursor, readEnd, currentDepth);
165 currentDepth--;
166 startCursor = readEnd;
167 boundary.poll();
168 }
169
170 setDepth(startCursor, start, currentDepth);
171 startCursor = start;
172 currentDepth++;
173 }
174
175 for (; !boundary.isEmpty();) {
176 int readEnd = boundary.peek();
177 setDepth(startCursor, readEnd, currentDepth);
178 currentDepth--;
179 startCursor = readEnd;
180 boundary.poll();
181 }
182
183 }
184
185 private void setDepth(int start, int end, int depth) {
186 if (depth <= 0)
187 return;
188 int binStart = w.getXPosOnWindow(start, pixelWidth);
189 int binEnd = w.getXPosOnWindow(end, pixelWidth);
190 for (int b = binStart; b <= binEnd; ++b) {
191 if (b < 0)
192 continue;
193 if (b >= coverage.length)
194 break;
195
196 coverage[b] = Math.max(coverage[b], currentDepth);
197 }
198
199 }
200
201 }
202
203 public static List<OnGenome> depthCoverage(ChrLoc loc, int pixelWidth, List<SAMRecord> loadedReadSet, CloseableIterator<SAMRecord> cursor) {
204
205 ComputeDepth d = new ComputeDepth(loc, pixelWidth);
206 d.computeDepth(loadedReadSet, cursor);
207
208 List<OnGenome> result = new ArrayList<OnGenome>();
209 result.add(new ReadCoverage(loc.start, loc.end, pixelWidth, d.coverage));
210 return result;
211 }
212
213 public static List<OnGenome> depthCoverage(File bamFile, ChrLoc loc, int pixelWidth, ReadQueryConfig config) throws UTGBException {
214
215 if (config.wigPath != null && loc.length() >= 10000) {
216 return depthCoverageInWIG(new File(config.wigPath), loc, pixelWidth, config);
217 }
218
219 if (_logger.isDebugEnabled())
220 _logger.debug(String.format("depth coverage: %s - %s", bamFile, loc));
221
222 File baiFile = getBamIndexFile(bamFile);
223 SAMFileReader sam = new SAMFileReader(bamFile, baiFile, false);
224 sam.setValidationStringency(ValidationStringency.SILENT);
225
226
227 CloseableIterator<SAMRecord> it = sam.queryOverlapping(loc.chr, loc.start, loc.end);
228 try {
229 ComputeDepth d = new ComputeDepth(loc, pixelWidth);
230 d.computeDepth(it);
231 List<OnGenome> result = new ArrayList<OnGenome>();
232 result.add(new ReadCoverage(loc.start, loc.end, pixelWidth, d.coverage));
233 return result;
234 }
235 finally {
236 if (it != null)
237 it.close();
238 }
239 }
240
241 public static List<OnGenome> depthCoverageInWIG(File wigFile, ChrLoc loc, int pixelWidth, ReadQueryConfig config) throws UTGBException {
242 try {
243 ArrayList<OnGenome> result = new ArrayList<OnGenome>();
244
245 if (_logger.isDebugEnabled())
246 _logger.debug(String.format("depth coverage in WIG: %s - %s", wigFile, loc));
247
248 List<CompactWIGData> wigData = WIGDatabaseReader.getCompactWigDataList(wigFile, pixelWidth, loc, config.window);
249 for (CompactWIGData each : wigData) {
250 ReadCoverage rc = each.toReadCoverage(loc);
251 result.add(rc);
252 if (_logger.isDebugEnabled()) {
253 ArrayList<Integer> firstSample = new ArrayList<Integer>();
254 for (int i = 0; i < 10 && i < rc.coverage.length; ++i) {
255 firstSample.add(rc.coverage[i]);
256 }
257 _logger.debug(String.format("wig: %s, loc:%s, depth:[%s, ...]", wigFile, loc, StringUtil.join(firstSample, ", ")));
258 }
259
260 }
261 return result;
262 }
263 catch (Exception e) {
264 throw UTGBException.convert(e);
265 }
266
267 }
268
269
270
271
272
273
274
275
276
277 public static List<OnGenome> overlapQuery(File bamFile, ChrLoc loc, int pixelWidth, ReadQueryConfig config) throws UTGBException {
278
279 if (config.wigPath != null && loc.length() >= 10000) {
280 return depthCoverageInWIG(new File(config.wigPath), loc, pixelWidth, config);
281 }
282
283 File baiFile = getBamIndexFile(bamFile);
284 SAMFileReader sam = new SAMFileReader(bamFile, baiFile, false);
285 sam.setValidationStringency(ValidationStringency.SILENT);
286
287 if (_logger.isDebugEnabled())
288 _logger.debug(String.format("query BAM (%s) %s", bamFile, loc));
289
290 int readCount = 0;
291 List<OnGenome> result = new ArrayList<OnGenome>();
292 {
293 List<SAMRecord> readSet = new ArrayList<SAMRecord>();
294
295
296 CloseableIterator<SAMRecord> it = sam.queryOverlapping(loc.chr, loc.start, loc.end);
297 try {
298
299 for (; it.hasNext();) {
300 SAMRecord read = it.next();
301
302 if (_logger.isDebugEnabled() && readCount > 0 && (readCount % 10000) == 0) {
303 _logger.debug(String.format("reading (%s) %s : %d reads", bamFile.getName(), loc, readCount));
304 }
305
306
307 if (read.getReadUnmappedFlag())
308 continue;
309
310 readCount++;
311 readSet.add(read);
312
313 if (readCount > config.maxmumNumberOfReadsToDisplay) {
314
315 return depthCoverage(loc, pixelWidth, readSet, it);
316 }
317
318 }
319 }
320 finally {
321 if (it != null)
322 it.close();
323
324 sam.close();
325
326 if (_logger.isDebugEnabled()) {
327 _logger.debug(String.format("finished reading (%s) %s : %d reads", bamFile.getName(), loc, readCount));
328 }
329 }
330 SAMReadFactory rf = readCount < 500 ? new CompleteSAMReadFactory() : new LightWeightSAMReadFactory();
331
332
333 {
334 HashMap<String, SAMRecord> samReadTable = new HashMap<String, SAMRecord>();
335 for (SAMRecord read : readSet) {
336
337
338 if (!read.getReadPairedFlag()) {
339 result.add(rf.newSAMRead(read));
340 continue;
341 }
342
343
344 if (!samReadTable.containsKey(read.getReadName())) {
345
346 samReadTable.put(read.getReadName(), read);
347 continue;
348 }
349
350
351 SAMRecord mate = samReadTable.get(read.getReadName());
352 boolean foundPair = false;
353 if (read.getFirstOfPairFlag()) {
354 if (mate.getSecondOfPairFlag()) {
355 result.add(new SAMReadPair(rf.newSAMRead(read), rf.newSAMRead(mate)));
356 foundPair = true;
357 }
358 }
359 else {
360 if (mate.getFirstOfPairFlag()) {
361 result.add(new SAMReadPair(rf.newSAMRead(mate), rf.newSAMRead(read)));
362 foundPair = true;
363 }
364 }
365
366 if (!foundPair) {
367
368 result.add(rf.newSAMRead(mate));
369 result.add(rf.newSAMRead(read));
370 }
371
372 samReadTable.remove(mate.getReadName());
373 }
374
375
376 for (SAMRecord each : samReadTable.values()) {
377 if (each.getReadPairedFlag()) {
378 if (each.getReferenceName().equals(each.getMateReferenceName())) {
379 result.add(new SAMReadPairFragment(rf.newSAMRead(each), each.getMateAlignmentStart()));
380 }
381 else {
382 result.add(rf.newSAMRead(each));
383 }
384 }
385 }
386 }
387 }
388
389 if (_logger.isDebugEnabled()) {
390 _logger.debug(String.format("sorting (%s) %s : %d reads", bamFile.getName(), loc, readCount));
391 }
392
393 Collections.sort(result, new Comparator<OnGenome>() {
394 public int compare(OnGenome o1, OnGenome o2) {
395 int diff = o1.getStart() - o2.getStart();
396 if (diff == 0) {
397 return o2.length() - o1.length();
398 }
399 else
400 return diff;
401 }
402 });
403
404 if (_logger.isDebugEnabled()) {
405 _logger.debug(String.format("sorting (%s) %s : %d reads. done.", bamFile.getName(), loc, readCount));
406 }
407
408 return result;
409 }
410
411 }