View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2009 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // BED2Silk.java
20  // Since: 2009/05/07
21  //
22  // $URL: http://svn.utgenome.org/utgb/trunk/utgb/utgb-shell/src/main/java/org/utgenome/shell/db/bed/BED2Silk.java $ 
23  // $Author: leo $
24  //--------------------------------------
25  package org.utgenome.format.bed;
26  
27  import java.io.BufferedReader;
28  import java.io.File;
29  import java.io.FileReader;
30  import java.io.IOException;
31  import java.io.PrintWriter;
32  import java.io.Reader;
33  import java.io.StringReader;
34  import java.io.StringWriter;
35  import java.util.ArrayList;
36  import java.util.zip.DataFormatException;
37  
38  import org.antlr.runtime.ANTLRReaderStream;
39  import org.antlr.runtime.CommonTokenStream;
40  import org.antlr.runtime.RecognitionException;
41  import org.antlr.runtime.tree.Tree;
42  import org.utgenome.UTGBErrorCode;
43  import org.utgenome.UTGBException;
44  import org.utgenome.gwt.utgb.client.util.StringUtil;
45  import org.xerial.core.XerialException;
46  import org.xerial.lens.Lens;
47  import org.xerial.silk.SilkWriter;
48  import org.xerial.util.log.Logger;
49  
50  /**
51   * Converting BED into Silk format.
52   * 
53   * 
54   * <p>
55   * Note that BED is a 0-based gene data format, while UTGB uses 1-based [start, end) interval representation. To fill
56   * the gap between BED and UTGB, BED2Silk translates BED's 0-based entries into 1-based ones.
57   * </p>
58   * 
59   * 
60   * @author yoshimura
61   * 
62   */
63  public class BED2Silk {
64  
65  	private static Logger _logger = Logger.getLogger(BED2Silk.class);
66  	private final BufferedReader reader;
67  
68  	public static class BEDHeaderDescription {
69  		String name;
70  		ArrayList<BEDHeaderAttribute> attributes = new ArrayList<BEDHeaderAttribute>();
71  
72  		public void setName(String name) {
73  			this.name = name;
74  		}
75  
76  		public void addAttribute(BEDHeaderAttribute attribute) {
77  			attributes.add(attribute);
78  		}
79  
80  		@Override
81  		public String toString() {
82  			return String.format("name=%s, attributes=%s", name, attributes.toString());
83  		}
84  	}
85  
86  	public static class BEDHeaderAttribute {
87  		String name;
88  		String value;
89  
90  		public void setName(String name) {
91  			this.name = name;
92  		}
93  
94  		public void setValue(String value) {
95  			this.value = value;
96  		}
97  
98  		@Override
99  		public String toString() {
100 			return String.format("{name=%s, value=%s}", name, value);
101 		}
102 	}
103 
104 	public BED2Silk(File bedFile) throws IOException {
105 		this(new FileReader(bedFile));
106 	}
107 
108 	/**
109 	 * 
110 	 * @param bedFile
111 	 * @throws IOException
112 	 */
113 	public BED2Silk(Reader bedFile) throws IOException {
114 		this.reader = new BufferedReader(bedFile);
115 
116 	}
117 
118 	public void close() throws IOException {
119 		if (reader != null)
120 			reader.close();
121 	}
122 
123 	/**
124 	 * Convert a BED's gene line into a Silk's tab-delimited format
125 	 * 
126 	 * @param line
127 	 * @param lineNum
128 	 * @return
129 	 * @throws DataFormatException
130 	 * @throws UTGBException
131 	 */
132 	private String createGeneTSV(String line, int lineNum) throws UTGBException {
133 
134 		try {
135 			String[] gene = readBEDLine(line);
136 			StringBuilder sb = new StringBuilder();
137 			if (gene.length < 3) {
138 				throw new UTGBException(UTGBErrorCode.INVALID_BED_LINE, String.format("line %d doesn't have 3 columns: %s", lineNum, line));
139 			}
140 
141 			int start = Integer.parseInt(gene[1]) + 1;
142 			int end = Integer.parseInt(gene[2]) + 1;
143 
144 			// print "coordinate.name, start, end"
145 			sb.append(String.format("%s\t%d\t%d\t", gene[0], start, end));
146 			// print "name"
147 			if (gene.length >= 4) {
148 				sb.append(gene[3]);
149 			}
150 			// print "strand"
151 			sb.append("\t");
152 			if (gene.length >= 6) {
153 				if (gene[5].equals("+") || gene[5].equals("-")) {
154 					sb.append(gene[5]);
155 				}
156 				else {
157 					_logger.warn(String.format("Illegal strand value '%s'. Using '+' instead. ", gene[5]));
158 					sb.append("+");
159 				}
160 			}
161 			// print "cds"
162 			sb.append("\t");
163 			if (gene.length >= 8) {
164 				int cdsStart = Integer.parseInt(gene[6]) + 1;
165 				int cdsEnd = Integer.parseInt(gene[7]) + 1;
166 				sb.append(String.format("[%d, %d]", cdsStart, cdsEnd));
167 			}
168 			// print "exon"
169 			sb.append("\t");
170 			if (gene.length >= 12) {
171 				String[] blockSizes = gene[10].split(",");
172 				String[] blockStarts = gene[11].split(",");
173 
174 				sb.append("[");
175 				Integer nExons = Integer.parseInt(gene[9]);
176 				for (int i = 0; i < nExons; i++) {
177 					int startExon = start + Integer.parseInt(blockStarts[i]);
178 					int endExon = startExon + Integer.parseInt(blockSizes[i]);
179 					sb.append("[" + startExon + ", " + endExon + "]");
180 					if (i < nExons - 1) {
181 						sb.append(", ");
182 					}
183 				}
184 				sb.append("]");
185 			}
186 
187 			// print "color"
188 			sb.append("\t");
189 			if (gene.length >= 9) {
190 				sb.append(changeRGB2Hex(gene[8]));
191 			}
192 			// print "score"
193 			sb.append("\t");
194 			if (gene.length >= 5) {
195 				sb.append("{\"score\":" + gene[4] + "}");
196 			}
197 
198 			return sb.toString();
199 		}
200 		catch (NumberFormatException e) {
201 			throw new UTGBException(UTGBErrorCode.INVALID_BED_LINE, String.format("line %d: %s", lineNum, e));
202 		}
203 		catch (DataFormatException e) {
204 			throw new UTGBException(UTGBErrorCode.INVALID_BED_LINE, String.format("line %d: %s", lineNum, e));
205 		}
206 		catch (IllegalArgumentException e) {
207 			throw new UTGBException(UTGBErrorCode.INVALID_BED_LINE, String.format("line %d: %s", lineNum, e));
208 		}
209 
210 	}
211 
212 	/**
213 	 * 
214 	 * @param out
215 	 * @throws IOException
216 	 * @throws UTGBShellException
217 	 */
218 	public void toSilk(PrintWriter pout) throws IOException, UTGBException {
219 
220 		SilkWriter out = new SilkWriter(pout);
221 
222 		// print header line
223 		out.preamble();
224 
225 		int geneCount = 0;
226 
227 		int lineNum = 1;
228 		for (String line; (line = reader.readLine()) != null; lineNum++) {
229 			try {
230 				if (line.startsWith("#") || line.length() == 0) {
231 				}
232 				else if (line.startsWith("browser")) {
233 					// this.browser = readTrackLine(line,i);
234 				}
235 				else if (line.startsWith("track")) {
236 					// print track line
237 					BEDHeaderDescription track = readTrackLine(line);
238 					SilkWriter trackNode = out.node("track");
239 					for (BEDHeaderAttribute a : track.attributes) {
240 						trackNode.leaf(a.name, StringUtil.unquote(a.value));
241 					}
242 				}
243 				else {
244 					String dataLine = createGeneTSV(line, lineNum);
245 					// output data line
246 					if (geneCount == 0) {
247 						// print gene header line
248 						SilkWriter geneNode = out.tabDataSchema("gene");
249 						geneNode.attribute("coordinate");
250 						geneNode.attribute("start");
251 						geneNode.attribute("end");
252 						geneNode.attribute("name");
253 						geneNode.attribute("strand");
254 						geneNode.attribute("cds(start, end)");
255 						geneNode.attribute("exon(start, end)*");
256 						geneNode.attribute("color");
257 						geneNode.attribute("_[json]");
258 					}
259 					out.dataLine(dataLine);
260 					geneCount++;
261 				}
262 			}
263 			catch (RecognitionException e) {
264 				_logger.error(String.format("line %d has invalid format: %s", lineNum, e));
265 			}
266 			catch (XerialException e) {
267 				throw new UTGBException(String.format("line %d: %s", lineNum, e));
268 			}
269 			catch (UTGBException e) {
270 				switch (e.getErrorCode()) {
271 				case INVALID_BED_LINE:
272 					_logger.warn(e);
273 					continue;
274 				default:
275 					throw e;
276 				}
277 			}
278 		}
279 
280 		out.endDocument();
281 
282 	}
283 
284 	public String toSilk() throws IOException, UTGBException {
285 		StringWriter out = new StringWriter();
286 		toSilk(new PrintWriter(out));
287 		return out.toString();
288 	}
289 
290 	private static String[] readBEDLine(String line) throws DataFormatException {
291 		String[] temp = line.trim().split("[ \t]+");
292 		// split by tab or space
293 		if (temp.length < 3) {
294 			throw new DataFormatException("Number of line parameters < 3");
295 		}
296 		return temp;
297 	}
298 
299 	private static BEDHeaderDescription readTrackLine(String line) throws IOException, XerialException, RecognitionException {
300 		BEDLexer lexer = new BEDLexer(new ANTLRReaderStream(new StringReader(line)));
301 		CommonTokenStream tokens = new CommonTokenStream(lexer);
302 
303 		BEDParser parser = new BEDParser(tokens);
304 		BEDParser.description_return ret = parser.description();
305 
306 		return Lens.loadANTLRParseTree(BEDHeaderDescription.class, (Tree) ret.getTree(), BEDParser.tokenNames);
307 	}
308 
309 	private static String changeRGB2Hex(String rgb) throws NumberFormatException {
310 		String[] temp = rgb.split(",");
311 		StringBuffer ret = new StringBuffer("\"#");
312 		if (temp.length >= 3) {
313 			for (int i = 0; i < 3; i++) {
314 				Integer tempInt = Integer.parseInt(temp[i]);
315 				if (tempInt > 255 || tempInt < 0) {
316 					System.err.println("Warn : out of color range 0-255");
317 					return "";
318 				}
319 				if (Integer.toHexString(tempInt).length() == 1) {
320 					ret.append("0");
321 				}
322 				ret.append(Integer.toHexString(tempInt));
323 			}
324 			return ret.append("\"").toString();
325 		}
326 		else {
327 			return "";
328 		}
329 	}
330 
331 }