1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.utgenome.util.sv;
24
25 import java.io.BufferedReader;
26 import java.io.FileReader;
27 import java.io.IOException;
28 import java.io.Reader;
29 import java.util.ArrayList;
30 import java.util.List;
31
32 import org.utgenome.UTGBException;
33 import org.utgenome.format.bed.BED2SilkReader;
34 import org.utgenome.format.fasta.CompactFASTA;
35 import org.utgenome.format.fasta.Kmer;
36 import org.utgenome.gwt.utgb.client.bio.ACGTEncoder;
37 import org.utgenome.gwt.utgb.client.bio.AminoAcid;
38 import org.utgenome.gwt.utgb.client.bio.BEDGene;
39 import org.utgenome.gwt.utgb.client.bio.CDS;
40 import org.utgenome.gwt.utgb.client.bio.CodonTable;
41 import org.utgenome.gwt.utgb.client.bio.Exon;
42 import org.utgenome.gwt.utgb.client.bio.Gene;
43 import org.utgenome.gwt.utgb.client.bio.Interval;
44 import org.utgenome.gwt.utgb.client.canvas.IntervalTree;
45 import org.utgenome.util.StandardOutputStream;
46 import org.utgenome.util.kmer.KmerIntegerFactory;
47 import org.utgenome.util.sv.EnhancedGeneticVariation.MutationPosition;
48 import org.xerial.lens.SilkLens;
49 import org.xerial.silk.SilkWriter;
50 import org.xerial.util.ObjectHandler;
51 import org.xerial.util.log.Logger;
52 import org.xerial.util.opt.Argument;
53 import org.xerial.util.opt.OptionParser;
54 import org.xerial.util.opt.OptionParserException;
55
56
57
58
59
60
61
62
63
64 public class VariationAnnotator {
65
66 private static Logger _logger = Logger.getLogger(VariationAnnotator.class);
67
68 @Argument(index = 0)
69 private String fastaFile;
70
71 @Argument(index = 1)
72 private String geneBED;
73
74 @Argument(index = 2)
75 private String variationPosFile;
76
77 private CompactFASTA fasta;
78 private IntervalTree<BEDGene> geneSet = new IntervalTree<BEDGene>();
79
80 public VariationAnnotator() {
81
82 }
83
84 public static void main(String[] args) {
85 VariationAnnotator va = new VariationAnnotator();
86 OptionParser opt = new OptionParser(va);
87
88 try {
89 opt.parse(args);
90 va.execute();
91 }
92 catch (OptionParserException e) {
93 _logger.error(e);
94 }
95 catch (Exception e) {
96 _logger.error(e);
97 e.printStackTrace(System.err);
98 }
99
100 }
101
102 public void execute() throws Exception {
103
104
105 _logger.info("loading FASTA pac file: " + fastaFile);
106 fasta = CompactFASTA.loadIntoMemory(fastaFile);
107
108
109 _logger.info("loading gene information: " + geneBED);
110 Reader bedReader = new BED2SilkReader(new BufferedReader(new FileReader(geneBED)));
111 try {
112 SilkLens.findFromSilk(bedReader, "gene", BEDGene.class, new ObjectHandler<BEDGene>() {
113 public void finish() throws Exception {
114 _logger.info(String.format("loaded %d genes", geneSet.size()));
115 }
116
117 public void handle(BEDGene gene) throws Exception {
118 geneSet.add(gene);
119 }
120
121 public void init() throws Exception {
122 _logger.info("loading gene BED file...");
123 }
124 });
125 }
126 finally {
127 bedReader.close();
128 }
129
130
131 final SilkWriter silk = new SilkWriter(new StandardOutputStream());
132
133
134
135 _logger.info("loading variation data...");
136 SilkLens.findFromSilk(new BufferedReader(new FileReader(variationPosFile)), "variation", GeneticVariation.class, new ObjectHandler<GeneticVariation>() {
137 int count = 0;
138
139 public void finish() throws Exception {
140 _logger.info("The end of the variation data");
141 }
142
143 public void handle(GeneticVariation v) throws Exception {
144 count++;
145 List<EnhancedGeneticVariation> annotation = annotate(v);
146 for (Object each : annotation) {
147 silk.leafObject("snv", each);
148 }
149
150 if ((count % 10000) == 0) {
151 _logger.info(String.format("processed %,d mutations", count));
152 }
153 }
154
155 public void init() throws Exception {
156 _logger.info("annotating variations...");
157 }
158 });
159
160 silk.endDocument();
161 silk.close();
162
163 }
164
165 void output(GeneticVariation v) {
166 _logger.info(v);
167 }
168
169 private static KmerIntegerFactory kif = new KmerIntegerFactory(3);
170
171
172
173
174
175
176
177
178
179 List<EnhancedGeneticVariation> annotate(GeneticVariation v) throws IOException, UTGBException {
180
181 List<EnhancedGeneticVariation> result = new ArrayList<EnhancedGeneticVariation>();
182
183
184 List<BEDGene> overlappedGeneSet = geneSet.overlapQuery(v.start);
185
186 if (overlappedGeneSet.isEmpty()) {
187 EnhancedGeneticVariation annot = new EnhancedGeneticVariation(v);
188 annot.mutationPosition = MutationPosition.InterGenic;
189
190 result.add(annot);
191 return result;
192 }
193
194
195 for (BEDGene eachGene : overlappedGeneSet) {
196 final int numExon = eachGene.getExon().size();
197 final CDS cds = eachGene.cdsRange();
198
199
200 if (!cds.contains(v.start)) {
201 EnhancedGeneticVariation annot = new EnhancedGeneticVariation(v);
202 annot.geneName = eachGene.getName();
203 boolean is5pUTR = (v.start < cds.getStart()) && eachGene.isSense();
204 annot.mutationPosition = is5pUTR ? MutationPosition.UTR5 : MutationPosition.UTR3;
205 result.add(annot);
206 continue;
207 }
208
209
210 boolean foundVariation = false;
211 for (int exonIndex = 0; exonIndex < numExon; exonIndex++) {
212 final Exon exon = eachGene.getExon(eachGene.isSense() ? exonIndex : numExon - exonIndex - 1);
213 final int exonStart = exon.getStart();
214 final int exonEnd = exon.getEnd();
215
216 if (!exon.contains(v.start)) {
217
218 final Interval spliceSite5p = new Interval(exonStart - 2, exonStart);
219 if (spliceSite5p.contains(v.start)) {
220 EnhancedGeneticVariation annot = createReport(v, eachGene, MutationPosition.SS5);
221 result.add(annot);
222 foundVariation = true;
223 break;
224 }
225 final Interval spliceSite3p = new Interval(exonEnd, exonEnd + 2);
226 if (spliceSite3p.contains(v.start)) {
227 EnhancedGeneticVariation annot = createReport(v, eachGene, MutationPosition.SS3);
228 result.add(annot);
229 foundVariation = true;
230 break;
231 }
232
233 continue;
234 }
235
236
237 foundVariation = true;
238
239
240 int cdsStart = cds.contains(exonStart) ? exonStart : cds.getStart();
241 int cdsEnd = cds.contains(exonEnd) ? exonEnd : cds.getEnd();
242 final int distFromBoundary = (eachGene.isSense() ? v.start - cdsStart : cdsEnd - v.start - 1);
243 final int frameIndex = distFromBoundary / 3;
244 final int frameStart = eachGene.isSense() ? cdsStart + 3 * frameIndex : cdsEnd - 3 * (frameIndex + 1);
245
246
247 int frameOffset = (v.start - frameStart) % 3;
248 Kmer refCodon = new Kmer(fasta.getSequence(v.chr, frameStart, frameStart + 3));
249
250
251 final MutationPosition exonPos = getExonPosition(exonIndex, numExon);
252
253
254 switch (v.variationType) {
255 case Mutation: {
256
257 final String genoType = v.getAltBase().toGenoType();
258 for (int i = 0; i < genoType.length(); i++) {
259 final char altBase = genoType.charAt(i);
260 Kmer altCodon = new Kmer(refCodon);
261 altCodon.set(frameOffset, altBase);
262 if (!refCodon.equals(altCodon)) {
263 EnhancedGeneticVariation annot = createReport(v, eachGene, exonPos, refCodon);
264 AminoAcid altAA = CodonTable.toAminoAcid(eachGene.isSense() ? altCodon.toInt() : kif.reverseComplement(altCodon.toInt()));
265 annot.altAA = altAA;
266 annot.altCodon = altCodon.toString();
267 result.add(annot);
268 }
269 }
270
271 break;
272 }
273 case Deletion: {
274 EnhancedGeneticVariation annot = createReport(v, eachGene, exonPos, refCodon);
275 final int suffixStart = v.start + v.indelLength;
276 if (eachGene.isSense()) {
277 Kmer altCodon = new Kmer(fasta.getSequence(v.chr, frameStart, v.start));
278 altCodon.append(fasta.getSequence(v.chr, suffixStart, suffixStart + (3 - (v.start - frameStart))).toString());
279 annot.altAA = CodonTable.toAminoAcid(altCodon.toInt());
280 annot.altCodon = altCodon.toString();
281 }
282 else {
283 int newCDSEnd = cdsEnd - v.indelLength;
284 if (cdsEnd < v.start) {
285
286 }
287 else {
288 int newFrameIndex = (newCDSEnd - v.start - 1) / 3;
289 int newFrameStart = newCDSEnd - 3 * (frameIndex + 1);
290 Kmer altCodon = new Kmer(fasta.getSequence(v.chr, newFrameStart, v.start));
291 altCodon.append(fasta.getSequence(v.chr, suffixStart, suffixStart + (3 - (v.start - newFrameStart))).toString());
292 altCodon = altCodon.reverseComplement();
293
294 annot.altAA = CodonTable.toAminoAcid(altCodon.toInt());
295 annot.altCodon = altCodon.toString();
296 }
297 }
298 result.add(annot);
299 break;
300 }
301 case Insertion: {
302 EnhancedGeneticVariation annot = createReport(v, eachGene, exonPos, refCodon);
303 Kmer altCodon = new Kmer(refCodon).insert(frameOffset, annot.getGenotype().substring(1));
304 if (eachGene.isAntiSense()) {
305 altCodon.reverseComplement();
306 }
307
308 int altKmer = altCodon.toInt(3);
309 annot.altAA = CodonTable.toAminoAcid(altKmer);
310 annot.altCodon = ACGTEncoder.toString(altKmer, 3);
311 result.add(annot);
312 break;
313 }
314 default:
315 break;
316 }
317 }
318
319 if (!foundVariation) {
320
321 EnhancedGeneticVariation annot = createReport(v, eachGene, MutationPosition.Intron);
322 result.add(annot);
323 }
324 }
325
326 return result;
327
328 }
329
330 private static EnhancedGeneticVariation createReport(GeneticVariation v, Gene gene, MutationPosition pos) {
331 EnhancedGeneticVariation annot = new EnhancedGeneticVariation(v);
332 annot.strand = gene.isSense() ? "+" : "-";
333 annot.geneName = gene.getName();
334 annot.mutationPosition = pos;
335 return annot;
336 }
337
338 private static EnhancedGeneticVariation createReport(GeneticVariation v, Gene gene, MutationPosition pos, Kmer refCodon) {
339 EnhancedGeneticVariation annot = new EnhancedGeneticVariation(v);
340 annot.strand = gene.isSense() ? "+" : "-";
341 annot.geneName = gene.getName();
342 annot.mutationPosition = pos;
343 annot.refCodon = refCodon.toString();
344 annot.refAA = CodonTable.toAminoAcid(gene.isSense() ? refCodon.toInt(3) : kif.reverseComplement(refCodon.toInt(3)));
345 return annot;
346
347 }
348
349 private static MutationPosition getExonPosition(int exonPos, int numExon) {
350 return (exonPos == 0) ? MutationPosition.FirstExon : (exonPos == numExon - 1) ? MutationPosition.LastExon : MutationPosition.Exon;
351 }
352
353 }