View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2009 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // SAMReader.java
20  // Since: Dec 3, 2009
21  //
22  // $URL$ 
23  // $Author$
24  //--------------------------------------
25  package org.utgenome.format.sam;
26  
27  import java.io.File;
28  import java.io.InputStream;
29  import java.util.ArrayList;
30  import java.util.Collections;
31  import java.util.Comparator;
32  import java.util.HashMap;
33  import java.util.Iterator;
34  import java.util.List;
35  import java.util.PriorityQueue;
36  
37  import net.sf.samtools.SAMFileReader;
38  import net.sf.samtools.SAMFileReader.ValidationStringency;
39  import net.sf.samtools.SAMRecord;
40  import net.sf.samtools.SAMSequenceDictionary;
41  import net.sf.samtools.SAMSequenceRecord;
42  import net.sf.samtools.util.CloseableIterator;
43  
44  import org.utgenome.UTGBException;
45  import org.utgenome.format.wig.WIGDatabaseReader;
46  import org.utgenome.graphics.GenomeWindow;
47  import org.utgenome.gwt.utgb.client.bio.ChrLoc;
48  import org.utgenome.gwt.utgb.client.bio.CompactWIGData;
49  import org.utgenome.gwt.utgb.client.bio.Interval;
50  import org.utgenome.gwt.utgb.client.bio.OnGenome;
51  import org.utgenome.gwt.utgb.client.bio.ReadCoverage;
52  import org.utgenome.gwt.utgb.client.bio.ReadQueryConfig;
53  import org.utgenome.gwt.utgb.client.bio.SAMReadLight;
54  import org.utgenome.gwt.utgb.client.bio.SAMReadPair;
55  import org.utgenome.gwt.utgb.client.bio.SAMReadPairFragment;
56  import org.utgenome.gwt.utgb.client.canvas.IntervalTree;
57  import org.xerial.util.StringUtil;
58  import org.xerial.util.log.Logger;
59  
60  /**
61   * SAM File reader
62   * 
63   * @author leo
64   * 
65   */
66  public class SAMReader {
67  
68  	private static Logger _logger = Logger.getLogger(SAMReader.class);
69  
70  	public static Iterable<SAMRecord> getSAMRecordReader(InputStream samFile) {
71  		return new SAMFileReader(samFile);
72  	}
73  
74  	public List<String> getChrList(File samOrBamFile) {
75  
76  		SAMFileReader sam = new SAMFileReader(samOrBamFile);
77  		try {
78  			sam.setValidationStringency(ValidationStringency.SILENT);
79  
80  			SAMSequenceDictionary dict = sam.getFileHeader().getSequenceDictionary();
81  			List<String> chrList = new ArrayList<String>();
82  			for (SAMSequenceRecord eachSeq : dict.getSequences()) {
83  				chrList.add(eachSeq.getSequenceName());
84  			}
85  			return chrList;
86  		}
87  		finally {
88  			sam.close();
89  		}
90  	}
91  
92  	public static File getBamIndexFile(File bamFile) {
93  		File baiFile = new File(bamFile.getAbsolutePath() + ".bai");
94  		return baiFile;
95  	}
96  
97  	public static interface SAMReadFactory {
98  		public SAMReadLight newSAMRead(SAMRecord r);
99  	}
100 
101 	public static class CompleteSAMReadFactory implements SAMReadFactory {
102 		public SAMReadLight newSAMRead(SAMRecord r) {
103 			return SAM2SilkReader.convertToSAMRead(r);
104 		}
105 	}
106 
107 	public static class LightWeightSAMReadFactory implements SAMReadFactory {
108 		public SAMReadLight newSAMRead(SAMRecord r) {
109 			return SAM2SilkReader.convertToSAMReadLight(r);
110 		}
111 	}
112 
113 	private static class ComputeDepth {
114 
115 		private final GenomeWindow w;
116 		private final int pixelWidth;
117 
118 		public int[] coverage;
119 		private final ChrLoc loc;
120 		private int startCursor = 0;
121 		private int readCount = 0;
122 		private IntervalTree<Interval> intervals = new IntervalTree<Interval>();
123 
124 		private int currentDepth = 0;
125 
126 		public ComputeDepth(ChrLoc loc, int pixelWidth) {
127 			w = new GenomeWindow(loc.start, loc.end);
128 			this.pixelWidth = pixelWidth;
129 			this.loc = loc;
130 			startCursor = loc.start;
131 			coverage = new int[pixelWidth];
132 			for (int i = 0; i < coverage.length; ++i)
133 				coverage[i] = 0;
134 		}
135 
136 		public void computeDepth(List<SAMRecord> loadedReadSet, CloseableIterator<SAMRecord> cursor) {
137 			computeDepth(loadedReadSet.iterator());
138 			computeDepth(cursor);
139 		}
140 
141 		private PriorityQueue<Integer> boundary = new PriorityQueue<Integer>();
142 
143 		public void computeDepth(Iterator<SAMRecord> cursor) {
144 
145 			for (; cursor.hasNext();) {
146 				SAMRecord read = cursor.next();
147 				if (read.getReadUnmappedFlag())
148 					continue;
149 
150 				readCount++;
151 				if (_logger.isDebugEnabled() && readCount > 0 && (readCount % 10000) == 0) {
152 					_logger.debug(String.format("reading %s : %d reads", loc, readCount));
153 				}
154 
155 				int start = read.getAlignmentStart();
156 				int end = read.getAlignmentEnd();
157 				boundary.add(end);
158 
159 				for (; !boundary.isEmpty();) {
160 					int readEnd = boundary.peek();
161 					if (readEnd > start)
162 						break;
163 
164 					setDepth(startCursor, readEnd, currentDepth);
165 					currentDepth--;
166 					startCursor = readEnd;
167 					boundary.poll();
168 				}
169 
170 				setDepth(startCursor, start, currentDepth);
171 				startCursor = start;
172 				currentDepth++;
173 			}
174 
175 			for (; !boundary.isEmpty();) {
176 				int readEnd = boundary.peek();
177 				setDepth(startCursor, readEnd, currentDepth);
178 				currentDepth--;
179 				startCursor = readEnd;
180 				boundary.poll();
181 			}
182 
183 		}
184 
185 		private void setDepth(int start, int end, int depth) {
186 			if (depth <= 0)
187 				return;
188 			int binStart = w.getXPosOnWindow(start, pixelWidth);
189 			int binEnd = w.getXPosOnWindow(end, pixelWidth);
190 			for (int b = binStart; b <= binEnd; ++b) {
191 				if (b < 0)
192 					continue;
193 				if (b >= coverage.length)
194 					break;
195 
196 				coverage[b] = Math.max(coverage[b], currentDepth);
197 			}
198 
199 		}
200 
201 	}
202 
203 	public static List<OnGenome> depthCoverage(ChrLoc loc, int pixelWidth, List<SAMRecord> loadedReadSet, CloseableIterator<SAMRecord> cursor) {
204 
205 		ComputeDepth d = new ComputeDepth(loc, pixelWidth);
206 		d.computeDepth(loadedReadSet, cursor);
207 
208 		List<OnGenome> result = new ArrayList<OnGenome>();
209 		result.add(new ReadCoverage(loc.start, loc.end, pixelWidth, d.coverage));
210 		return result;
211 	}
212 
213 	public static List<OnGenome> depthCoverage(File bamFile, ChrLoc loc, int pixelWidth, ReadQueryConfig config) throws UTGBException {
214 
215 		if (config.wigPath != null && loc.length() >= 10000) { // 10,000b = 10Kb
216 			return depthCoverageInWIG(new File(config.wigPath), loc, pixelWidth, config);
217 		}
218 
219 		if (_logger.isDebugEnabled())
220 			_logger.debug(String.format("depth coverage: %s - %s", bamFile, loc));
221 
222 		File baiFile = getBamIndexFile(bamFile);
223 		SAMFileReader sam = new SAMFileReader(bamFile, baiFile, false);
224 		sam.setValidationStringency(ValidationStringency.SILENT);
225 
226 		// Retrieve SAMRecords from the  BAM file
227 		CloseableIterator<SAMRecord> it = sam.queryOverlapping(loc.chr, loc.start, loc.end);
228 		try {
229 			ComputeDepth d = new ComputeDepth(loc, pixelWidth);
230 			d.computeDepth(it);
231 			List<OnGenome> result = new ArrayList<OnGenome>();
232 			result.add(new ReadCoverage(loc.start, loc.end, pixelWidth, d.coverage));
233 			return result;
234 		}
235 		finally {
236 			if (it != null)
237 				it.close();
238 		}
239 	}
240 
241 	public static List<OnGenome> depthCoverageInWIG(File wigFile, ChrLoc loc, int pixelWidth, ReadQueryConfig config) throws UTGBException {
242 		try {
243 			ArrayList<OnGenome> result = new ArrayList<OnGenome>();
244 
245 			if (_logger.isDebugEnabled())
246 				_logger.debug(String.format("depth coverage in WIG: %s - %s", wigFile, loc));
247 
248 			List<CompactWIGData> wigData = WIGDatabaseReader.getCompactWigDataList(wigFile, pixelWidth, loc, config.window);
249 			for (CompactWIGData each : wigData) {
250 				ReadCoverage rc = each.toReadCoverage(loc);
251 				result.add(rc);
252 				if (_logger.isDebugEnabled()) {
253 					ArrayList<Integer> firstSample = new ArrayList<Integer>();
254 					for (int i = 0; i < 10 && i < rc.coverage.length; ++i) {
255 						firstSample.add(rc.coverage[i]);
256 					}
257 					_logger.debug(String.format("wig: %s, loc:%s, depth:[%s, ...]", wigFile, loc, StringUtil.join(firstSample, ", ")));
258 				}
259 
260 			}
261 			return result;
262 		}
263 		catch (Exception e) {
264 			throw UTGBException.convert(e);
265 		}
266 
267 	}
268 
269 	/**
270 	 * Retrieved SAMReads (or SAMReadPair) overlapped with the specified interval
271 	 * 
272 	 * @param bamFile
273 	 * @param loc
274 	 * @return
275 	 * @throws UTGBException
276 	 */
277 	public static List<OnGenome> overlapQuery(File bamFile, ChrLoc loc, int pixelWidth, ReadQueryConfig config) throws UTGBException {
278 
279 		if (config.wigPath != null && loc.length() >= 10000) {
280 			return depthCoverageInWIG(new File(config.wigPath), loc, pixelWidth, config);
281 		}
282 
283 		File baiFile = getBamIndexFile(bamFile);
284 		SAMFileReader sam = new SAMFileReader(bamFile, baiFile, false);
285 		sam.setValidationStringency(ValidationStringency.SILENT);
286 
287 		if (_logger.isDebugEnabled())
288 			_logger.debug(String.format("query BAM (%s) %s", bamFile, loc));
289 
290 		int readCount = 0;
291 		List<OnGenome> result = new ArrayList<OnGenome>();
292 		{
293 			List<SAMRecord> readSet = new ArrayList<SAMRecord>();
294 
295 			// Retrieve SAMRecords from the  BAM file
296 			CloseableIterator<SAMRecord> it = sam.queryOverlapping(loc.chr, loc.start, loc.end);
297 			try {
298 
299 				for (; it.hasNext();) {
300 					SAMRecord read = it.next();
301 
302 					if (_logger.isDebugEnabled() && readCount > 0 && (readCount % 10000) == 0) {
303 						_logger.debug(String.format("reading (%s) %s : %d reads", bamFile.getName(), loc, readCount));
304 					}
305 
306 					// ignore unmapped reads
307 					if (read.getReadUnmappedFlag())
308 						continue;
309 
310 					readCount++;
311 					readSet.add(read);
312 
313 					if (readCount > config.maxmumNumberOfReadsToDisplay) {
314 						// Switch to the depth-coverage mode
315 						return depthCoverage(loc, pixelWidth, readSet, it);
316 					}
317 
318 				}
319 			}
320 			finally {
321 				if (it != null)
322 					it.close();
323 
324 				sam.close();
325 
326 				if (_logger.isDebugEnabled()) {
327 					_logger.debug(String.format("finished reading (%s) %s : %d reads", bamFile.getName(), loc, readCount));
328 				}
329 			}
330 			SAMReadFactory rf = readCount < 500 ? new CompleteSAMReadFactory() : new LightWeightSAMReadFactory();
331 
332 			// Mating paired-end reads
333 			{
334 				HashMap<String, SAMRecord> samReadTable = new HashMap<String, SAMRecord>();
335 				for (SAMRecord read : readSet) {
336 
337 					// Add single-end read as is
338 					if (!read.getReadPairedFlag()) {
339 						result.add(rf.newSAMRead(read));
340 						continue;
341 					}
342 
343 					// The paired read names must be the same.
344 					if (!samReadTable.containsKey(read.getReadName())) {
345 						// new entry
346 						samReadTable.put(read.getReadName(), read);
347 						continue;
348 					}
349 
350 					// Found a paired-end read set.
351 					SAMRecord mate = samReadTable.get(read.getReadName());
352 					boolean foundPair = false;
353 					if (read.getFirstOfPairFlag()) {
354 						if (mate.getSecondOfPairFlag()) {
355 							result.add(new SAMReadPair(rf.newSAMRead(read), rf.newSAMRead(mate)));
356 							foundPair = true;
357 						}
358 					}
359 					else {
360 						if (mate.getFirstOfPairFlag()) {
361 							result.add(new SAMReadPair(rf.newSAMRead(mate), rf.newSAMRead(read)));
362 							foundPair = true;
363 						}
364 					}
365 
366 					if (!foundPair) {
367 						// The read names are the same, but they are not mated (error?)
368 						result.add(rf.newSAMRead(mate));
369 						result.add(rf.newSAMRead(read));
370 					}
371 
372 					samReadTable.remove(mate.getReadName());
373 				}
374 
375 				// add the remaining reads to the results 
376 				for (SAMRecord each : samReadTable.values()) {
377 					if (each.getReadPairedFlag()) {
378 						if (each.getReferenceName().equals(each.getMateReferenceName())) {
379 							result.add(new SAMReadPairFragment(rf.newSAMRead(each), each.getMateAlignmentStart()));
380 						}
381 						else {
382 							result.add(rf.newSAMRead(each));
383 						}
384 					}
385 				}
386 			}
387 		}
388 
389 		if (_logger.isDebugEnabled()) {
390 			_logger.debug(String.format("sorting (%s) %s : %d reads", bamFile.getName(), loc, readCount));
391 		}
392 
393 		Collections.sort(result, new Comparator<OnGenome>() {
394 			public int compare(OnGenome o1, OnGenome o2) {
395 				int diff = o1.getStart() - o2.getStart();
396 				if (diff == 0) {
397 					return o2.length() - o1.length();
398 				}
399 				else
400 					return diff;
401 			}
402 		});
403 
404 		if (_logger.isDebugEnabled()) {
405 			_logger.debug(String.format("sorting (%s) %s : %d reads. done.", bamFile.getName(), loc, readCount));
406 		}
407 
408 		return result;
409 	}
410 
411 }