View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2010 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // VariationAnnotator.java
20  // Since: 2010/10/13
21  //
22  //--------------------------------------
23  package org.utgenome.util.sv;
24  
25  import java.io.BufferedReader;
26  import java.io.FileReader;
27  import java.io.IOException;
28  import java.io.Reader;
29  import java.util.ArrayList;
30  import java.util.List;
31  
32  import org.utgenome.UTGBException;
33  import org.utgenome.format.bed.BED2SilkReader;
34  import org.utgenome.format.fasta.CompactFASTA;
35  import org.utgenome.format.fasta.Kmer;
36  import org.utgenome.gwt.utgb.client.bio.ACGTEncoder;
37  import org.utgenome.gwt.utgb.client.bio.AminoAcid;
38  import org.utgenome.gwt.utgb.client.bio.BEDGene;
39  import org.utgenome.gwt.utgb.client.bio.CDS;
40  import org.utgenome.gwt.utgb.client.bio.CodonTable;
41  import org.utgenome.gwt.utgb.client.bio.Exon;
42  import org.utgenome.gwt.utgb.client.bio.Gene;
43  import org.utgenome.gwt.utgb.client.bio.Interval;
44  import org.utgenome.gwt.utgb.client.canvas.IntervalTree;
45  import org.utgenome.util.StandardOutputStream;
46  import org.utgenome.util.kmer.KmerIntegerFactory;
47  import org.utgenome.util.sv.EnhancedGeneticVariation.MutationPosition;
48  import org.xerial.lens.SilkLens;
49  import org.xerial.silk.SilkWriter;
50  import org.xerial.util.ObjectHandler;
51  import org.xerial.util.log.Logger;
52  import org.xerial.util.opt.Argument;
53  import org.xerial.util.opt.OptionParser;
54  import org.xerial.util.opt.OptionParserException;
55  
56  /**
57   * Annotating structural variation types (e.g., synonymous mutation, frame shift, etc.). This code is based on
58   * Higasa-san's Perl script.
59   * 
60   * @author leo
61   * @author higasa
62   * 
63   */
64  public class VariationAnnotator {
65  
66  	private static Logger _logger = Logger.getLogger(VariationAnnotator.class);
67  
68  	@Argument(index = 0)
69  	private String fastaFile;
70  
71  	@Argument(index = 1)
72  	private String geneBED;
73  
74  	@Argument(index = 2)
75  	private String variationPosFile;
76  
77  	private CompactFASTA fasta;
78  	private IntervalTree<BEDGene> geneSet = new IntervalTree<BEDGene>();
79  
80  	public VariationAnnotator() {
81  
82  	}
83  
84  	public static void main(String[] args) {
85  		VariationAnnotator va = new VariationAnnotator();
86  		OptionParser opt = new OptionParser(va);
87  
88  		try {
89  			opt.parse(args);
90  			va.execute();
91  		}
92  		catch (OptionParserException e) {
93  			_logger.error(e);
94  		}
95  		catch (Exception e) {
96  			_logger.error(e);
97  			e.printStackTrace(System.err);
98  		}
99  
100 	}
101 
102 	public void execute() throws Exception {
103 
104 		// load FASTA (packed via utgb pack)
105 		_logger.info("loading FASTA pac file: " + fastaFile);
106 		fasta = CompactFASTA.loadIntoMemory(fastaFile);
107 
108 		// load BED
109 		_logger.info("loading gene information: " + geneBED);
110 		Reader bedReader = new BED2SilkReader(new BufferedReader(new FileReader(geneBED)));
111 		try {
112 			SilkLens.findFromSilk(bedReader, "gene", BEDGene.class, new ObjectHandler<BEDGene>() {
113 				public void finish() throws Exception {
114 					_logger.info(String.format("loaded %d genes", geneSet.size()));
115 				}
116 
117 				public void handle(BEDGene gene) throws Exception {
118 					geneSet.add(gene);
119 				}
120 
121 				public void init() throws Exception {
122 					_logger.info("loading gene BED file...");
123 				}
124 			});
125 		}
126 		finally {
127 			bedReader.close();
128 		}
129 
130 		// data output
131 		final SilkWriter silk = new SilkWriter(new StandardOutputStream());
132 
133 		// load variation position file
134 		// TODO parallelization
135 		_logger.info("loading variation data...");
136 		SilkLens.findFromSilk(new BufferedReader(new FileReader(variationPosFile)), "variation", GeneticVariation.class, new ObjectHandler<GeneticVariation>() {
137 			int count = 0;
138 
139 			public void finish() throws Exception {
140 				_logger.info("The end of the variation data");
141 			}
142 
143 			public void handle(GeneticVariation v) throws Exception {
144 				count++;
145 				List<EnhancedGeneticVariation> annotation = annotate(v);
146 				for (Object each : annotation) {
147 					silk.leafObject("snv", each);
148 				}
149 
150 				if ((count % 10000) == 0) {
151 					_logger.info(String.format("processed %,d mutations", count));
152 				}
153 			}
154 
155 			public void init() throws Exception {
156 				_logger.info("annotating variations...");
157 			}
158 		});
159 
160 		silk.endDocument();
161 		silk.close();
162 
163 	}
164 
165 	void output(GeneticVariation v) {
166 		_logger.info(v);
167 	}
168 
169 	private static KmerIntegerFactory kif = new KmerIntegerFactory(3);
170 
171 	/**
172 	 * Annotate the given genetic variation using the reference sequence and gene set.
173 	 * 
174 	 * @param v
175 	 * @return
176 	 * @throws UTGBException
177 	 * @throws IOException
178 	 */
179 	List<EnhancedGeneticVariation> annotate(GeneticVariation v) throws IOException, UTGBException {
180 
181 		List<EnhancedGeneticVariation> result = new ArrayList<EnhancedGeneticVariation>();
182 
183 		// Find genes containing the variation
184 		List<BEDGene> overlappedGeneSet = geneSet.overlapQuery(v.start);
185 
186 		if (overlappedGeneSet.isEmpty()) { // The variation is in an inter-genic region		
187 			EnhancedGeneticVariation annot = new EnhancedGeneticVariation(v);
188 			annot.mutationPosition = MutationPosition.InterGenic;
189 
190 			result.add(annot);
191 			return result;
192 		}
193 
194 		// The variation is in an exon/intron region
195 		for (BEDGene eachGene : overlappedGeneSet) {
196 			final int numExon = eachGene.getExon().size();
197 			final CDS cds = eachGene.cdsRange();
198 
199 			// Is in UTR?
200 			if (!cds.contains(v.start)) {
201 				EnhancedGeneticVariation annot = new EnhancedGeneticVariation(v);
202 				annot.geneName = eachGene.getName();
203 				boolean is5pUTR = (v.start < cds.getStart()) && eachGene.isSense();
204 				annot.mutationPosition = is5pUTR ? MutationPosition.UTR5 : MutationPosition.UTR3;
205 				result.add(annot);
206 				continue;
207 			}
208 
209 			// The variation is contained in CDS 
210 			boolean foundVariation = false;
211 			for (int exonIndex = 0; exonIndex < numExon; exonIndex++) {
212 				final Exon exon = eachGene.getExon(eachGene.isSense() ? exonIndex : numExon - exonIndex - 1);
213 				final int exonStart = exon.getStart();
214 				final int exonEnd = exon.getEnd();
215 
216 				if (!exon.contains(v.start)) {
217 					// Is in splice site? 
218 					final Interval spliceSite5p = new Interval(exonStart - 2, exonStart);
219 					if (spliceSite5p.contains(v.start)) {
220 						EnhancedGeneticVariation annot = createReport(v, eachGene, MutationPosition.SS5);
221 						result.add(annot);
222 						foundVariation = true;
223 						break;
224 					}
225 					final Interval spliceSite3p = new Interval(exonEnd, exonEnd + 2);
226 					if (spliceSite3p.contains(v.start)) {
227 						EnhancedGeneticVariation annot = createReport(v, eachGene, MutationPosition.SS3);
228 						result.add(annot);
229 						foundVariation = true;
230 						break;
231 					}
232 					// forward to the next exon 
233 					continue;
234 				}
235 
236 				// This variation is in an exon region
237 				foundVariation = true;
238 
239 				// Codon frame
240 				int cdsStart = cds.contains(exonStart) ? exonStart : cds.getStart();
241 				int cdsEnd = cds.contains(exonEnd) ? exonEnd : cds.getEnd();
242 				final int distFromBoundary = (eachGene.isSense() ? v.start - cdsStart : cdsEnd - v.start - 1);
243 				final int frameIndex = distFromBoundary / 3;
244 				final int frameStart = eachGene.isSense() ? cdsStart + 3 * frameIndex : cdsEnd - 3 * (frameIndex + 1);
245 
246 				// Get the codon in the reference sequence
247 				int frameOffset = (v.start - frameStart) % 3;
248 				Kmer refCodon = new Kmer(fasta.getSequence(v.chr, frameStart, frameStart + 3));
249 
250 				// Get the mutation position (first exon, exon, last exon) 
251 				final MutationPosition exonPos = getExonPosition(exonIndex, numExon);
252 
253 				// Create a variation report for each mutation type
254 				switch (v.variationType) {
255 				case Mutation: {
256 					// Check alternative amino acids 
257 					final String genoType = v.getAltBase().toGenoType();
258 					for (int i = 0; i < genoType.length(); i++) {
259 						final char altBase = genoType.charAt(i);
260 						Kmer altCodon = new Kmer(refCodon);
261 						altCodon.set(frameOffset, altBase);
262 						if (!refCodon.equals(altCodon)) {
263 							EnhancedGeneticVariation annot = createReport(v, eachGene, exonPos, refCodon);
264 							AminoAcid altAA = CodonTable.toAminoAcid(eachGene.isSense() ? altCodon.toInt() : kif.reverseComplement(altCodon.toInt()));
265 							annot.altAA = altAA;
266 							annot.altCodon = altCodon.toString();
267 							result.add(annot);
268 						}
269 					}
270 
271 					break;
272 				}
273 				case Deletion: {
274 					EnhancedGeneticVariation annot = createReport(v, eachGene, exonPos, refCodon);
275 					final int suffixStart = v.start + v.indelLength;
276 					if (eachGene.isSense()) {
277 						Kmer altCodon = new Kmer(fasta.getSequence(v.chr, frameStart, v.start));
278 						altCodon.append(fasta.getSequence(v.chr, suffixStart, suffixStart + (3 - (v.start - frameStart))).toString());
279 						annot.altAA = CodonTable.toAminoAcid(altCodon.toInt());
280 						annot.altCodon = altCodon.toString();
281 					}
282 					else {
283 						int newCDSEnd = cdsEnd - v.indelLength;
284 						if (cdsEnd < v.start) {
285 							// broken CDS. unable to determine the codon frame
286 						}
287 						else {
288 							int newFrameIndex = (newCDSEnd - v.start - 1) / 3;
289 							int newFrameStart = newCDSEnd - 3 * (frameIndex + 1);
290 							Kmer altCodon = new Kmer(fasta.getSequence(v.chr, newFrameStart, v.start));
291 							altCodon.append(fasta.getSequence(v.chr, suffixStart, suffixStart + (3 - (v.start - newFrameStart))).toString());
292 							altCodon = altCodon.reverseComplement();
293 
294 							annot.altAA = CodonTable.toAminoAcid(altCodon.toInt());
295 							annot.altCodon = altCodon.toString();
296 						}
297 					}
298 					result.add(annot);
299 					break;
300 				}
301 				case Insertion: {
302 					EnhancedGeneticVariation annot = createReport(v, eachGene, exonPos, refCodon);
303 					Kmer altCodon = new Kmer(refCodon).insert(frameOffset, annot.getGenotype().substring(1));
304 					if (eachGene.isAntiSense()) {
305 						altCodon.reverseComplement();
306 					}
307 
308 					int altKmer = altCodon.toInt(3);
309 					annot.altAA = CodonTable.toAminoAcid(altKmer);
310 					annot.altCodon = ACGTEncoder.toString(altKmer, 3);
311 					result.add(annot);
312 					break;
313 				}
314 				default:
315 					break;
316 				}
317 			}
318 
319 			if (!foundVariation) { // when no overlap with exons/SS is found
320 				// intron
321 				EnhancedGeneticVariation annot = createReport(v, eachGene, MutationPosition.Intron);
322 				result.add(annot);
323 			}
324 		}
325 
326 		return result;
327 
328 	}
329 
330 	private static EnhancedGeneticVariation createReport(GeneticVariation v, Gene gene, MutationPosition pos) {
331 		EnhancedGeneticVariation annot = new EnhancedGeneticVariation(v);
332 		annot.strand = gene.isSense() ? "+" : "-";
333 		annot.geneName = gene.getName();
334 		annot.mutationPosition = pos;
335 		return annot;
336 	}
337 
338 	private static EnhancedGeneticVariation createReport(GeneticVariation v, Gene gene, MutationPosition pos, Kmer refCodon) {
339 		EnhancedGeneticVariation annot = new EnhancedGeneticVariation(v);
340 		annot.strand = gene.isSense() ? "+" : "-";
341 		annot.geneName = gene.getName();
342 		annot.mutationPosition = pos;
343 		annot.refCodon = refCodon.toString();
344 		annot.refAA = CodonTable.toAminoAcid(gene.isSense() ? refCodon.toInt(3) : kif.reverseComplement(refCodon.toInt(3)));
345 		return annot;
346 
347 	}
348 
349 	private static MutationPosition getExonPosition(int exonPos, int numExon) {
350 		return (exonPos == 0) ? MutationPosition.FirstExon : (exonPos == numExon - 1) ? MutationPosition.LastExon : MutationPosition.Exon;
351 	}
352 
353 }