1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.utgenome.format.bed;
26
27 import java.io.BufferedReader;
28 import java.io.File;
29 import java.io.FileReader;
30 import java.io.IOException;
31 import java.io.PrintWriter;
32 import java.io.Reader;
33 import java.io.StringReader;
34 import java.io.StringWriter;
35 import java.util.ArrayList;
36 import java.util.zip.DataFormatException;
37
38 import org.antlr.runtime.ANTLRReaderStream;
39 import org.antlr.runtime.CommonTokenStream;
40 import org.antlr.runtime.RecognitionException;
41 import org.antlr.runtime.tree.Tree;
42 import org.utgenome.UTGBErrorCode;
43 import org.utgenome.UTGBException;
44 import org.utgenome.gwt.utgb.client.util.StringUtil;
45 import org.xerial.core.XerialException;
46 import org.xerial.lens.Lens;
47 import org.xerial.silk.SilkWriter;
48 import org.xerial.util.log.Logger;
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63 public class BED2Silk {
64
65 private static Logger _logger = Logger.getLogger(BED2Silk.class);
66 private final BufferedReader reader;
67
68 public static class BEDHeaderDescription {
69 String name;
70 ArrayList<BEDHeaderAttribute> attributes = new ArrayList<BEDHeaderAttribute>();
71
72 public void setName(String name) {
73 this.name = name;
74 }
75
76 public void addAttribute(BEDHeaderAttribute attribute) {
77 attributes.add(attribute);
78 }
79
80 @Override
81 public String toString() {
82 return String.format("name=%s, attributes=%s", name, attributes.toString());
83 }
84 }
85
86 public static class BEDHeaderAttribute {
87 String name;
88 String value;
89
90 public void setName(String name) {
91 this.name = name;
92 }
93
94 public void setValue(String value) {
95 this.value = value;
96 }
97
98 @Override
99 public String toString() {
100 return String.format("{name=%s, value=%s}", name, value);
101 }
102 }
103
/**
 * Creates a converter that reads its BED input from a file.
 * The file is opened with a {@link FileReader} and passed on to
 * {@link #BED2Silk(Reader)}.
 *
 * @param bedFile
 *            the BED file to read
 * @throws IOException
 *             if the file cannot be opened for reading
 */
public BED2Silk(File bedFile) throws IOException {
    this(new FileReader(bedFile));
}
107
108
109
110
111
112
/**
 * Creates a converter that reads its BED input from the given reader.
 * The reader is wrapped in a {@link BufferedReader}; nothing is read until a
 * conversion method is called.
 *
 * @param bedFile
 *            source of the BED text
 * @throws IOException
 *             declared for callers; this constructor body performs no read
 */
public BED2Silk(Reader bedFile) throws IOException {
    reader = new BufferedReader(bedFile);
}
117
118 public void close() throws IOException {
119 if (reader != null)
120 reader.close();
121 }
122
123
124
125
126
127
128
129
130
131
132 private String createGeneTSV(String line, int lineNum) throws UTGBException {
133
134 try {
135 String[] gene = readBEDLine(line);
136 StringBuilder sb = new StringBuilder();
137 if (gene.length < 3) {
138 throw new UTGBException(UTGBErrorCode.INVALID_BED_LINE, String.format("line %d doesn't have 3 columns: %s", lineNum, line));
139 }
140
141 int start = Integer.parseInt(gene[1]) + 1;
142 int end = Integer.parseInt(gene[2]) + 1;
143
144
145 sb.append(String.format("%s\t%d\t%d\t", gene[0], start, end));
146
147 if (gene.length >= 4) {
148 sb.append(gene[3]);
149 }
150
151 sb.append("\t");
152 if (gene.length >= 6) {
153 if (gene[5].equals("+") || gene[5].equals("-")) {
154 sb.append(gene[5]);
155 }
156 else {
157 _logger.warn(String.format("Illegal strand value '%s'. Using '+' instead. ", gene[5]));
158 sb.append("+");
159 }
160 }
161
162 sb.append("\t");
163 if (gene.length >= 8) {
164 int cdsStart = Integer.parseInt(gene[6]) + 1;
165 int cdsEnd = Integer.parseInt(gene[7]) + 1;
166 sb.append(String.format("[%d, %d]", cdsStart, cdsEnd));
167 }
168
169 sb.append("\t");
170 if (gene.length >= 12) {
171 String[] blockSizes = gene[10].split(",");
172 String[] blockStarts = gene[11].split(",");
173
174 sb.append("[");
175 Integer nExons = Integer.parseInt(gene[9]);
176 for (int i = 0; i < nExons; i++) {
177 int startExon = start + Integer.parseInt(blockStarts[i]);
178 int endExon = startExon + Integer.parseInt(blockSizes[i]);
179 sb.append("[" + startExon + ", " + endExon + "]");
180 if (i < nExons - 1) {
181 sb.append(", ");
182 }
183 }
184 sb.append("]");
185 }
186
187
188 sb.append("\t");
189 if (gene.length >= 9) {
190 sb.append(changeRGB2Hex(gene[8]));
191 }
192
193 sb.append("\t");
194 if (gene.length >= 5) {
195 sb.append("{\"score\":" + gene[4] + "}");
196 }
197
198 return sb.toString();
199 }
200 catch (NumberFormatException e) {
201 throw new UTGBException(UTGBErrorCode.INVALID_BED_LINE, String.format("line %d: %s", lineNum, e));
202 }
203 catch (DataFormatException e) {
204 throw new UTGBException(UTGBErrorCode.INVALID_BED_LINE, String.format("line %d: %s", lineNum, e));
205 }
206 catch (IllegalArgumentException e) {
207 throw new UTGBException(UTGBErrorCode.INVALID_BED_LINE, String.format("line %d: %s", lineNum, e));
208 }
209
210 }
211
212
213
214
215
216
217
218 public void toSilk(PrintWriter pout) throws IOException, UTGBException {
219
220 SilkWriter out = new SilkWriter(pout);
221
222
223 out.preamble();
224
225 int geneCount = 0;
226
227 int lineNum = 1;
228 for (String line; (line = reader.readLine()) != null; lineNum++) {
229 try {
230 if (line.startsWith("#") || line.length() == 0) {
231 }
232 else if (line.startsWith("browser")) {
233
234 }
235 else if (line.startsWith("track")) {
236
237 BEDHeaderDescription track = readTrackLine(line);
238 SilkWriter trackNode = out.node("track");
239 for (BEDHeaderAttribute a : track.attributes) {
240 trackNode.leaf(a.name, StringUtil.unquote(a.value));
241 }
242 }
243 else {
244 String dataLine = createGeneTSV(line, lineNum);
245
246 if (geneCount == 0) {
247
248 SilkWriter geneNode = out.tabDataSchema("gene");
249 geneNode.attribute("coordinate");
250 geneNode.attribute("start");
251 geneNode.attribute("end");
252 geneNode.attribute("name");
253 geneNode.attribute("strand");
254 geneNode.attribute("cds(start, end)");
255 geneNode.attribute("exon(start, end)*");
256 geneNode.attribute("color");
257 geneNode.attribute("_[json]");
258 }
259 out.dataLine(dataLine);
260 geneCount++;
261 }
262 }
263 catch (RecognitionException e) {
264 _logger.error(String.format("line %d has invalid format: %s", lineNum, e));
265 }
266 catch (XerialException e) {
267 throw new UTGBException(String.format("line %d: %s", lineNum, e));
268 }
269 catch (UTGBException e) {
270 switch (e.getErrorCode()) {
271 case INVALID_BED_LINE:
272 _logger.warn(e);
273 continue;
274 default:
275 throw e;
276 }
277 }
278 }
279
280 out.endDocument();
281
282 }
283
284 public String toSilk() throws IOException, UTGBException {
285 StringWriter out = new StringWriter();
286 toSilk(new PrintWriter(out));
287 return out.toString();
288 }
289
290 private static String[] readBEDLine(String line) throws DataFormatException {
291 String[] temp = line.trim().split("[ \t]+");
292
293 if (temp.length < 3) {
294 throw new DataFormatException("Number of line parameters < 3");
295 }
296 return temp;
297 }
298
299 private static BEDHeaderDescription readTrackLine(String line) throws IOException, XerialException, RecognitionException {
300 BEDLexer lexer = new BEDLexer(new ANTLRReaderStream(new StringReader(line)));
301 CommonTokenStream tokens = new CommonTokenStream(lexer);
302
303 BEDParser parser = new BEDParser(tokens);
304 BEDParser.description_return ret = parser.description();
305
306 return Lens.loadANTLRParseTree(BEDHeaderDescription.class, (Tree) ret.getTree(), BEDParser.tokenNames);
307 }
308
309 private static String changeRGB2Hex(String rgb) throws NumberFormatException {
310 String[] temp = rgb.split(",");
311 StringBuffer ret = new StringBuffer("\"#");
312 if (temp.length >= 3) {
313 for (int i = 0; i < 3; i++) {
314 Integer tempInt = Integer.parseInt(temp[i]);
315 if (tempInt > 255 || tempInt < 0) {
316 System.err.println("Warn : out of color range 0-255");
317 return "";
318 }
319 if (Integer.toHexString(tempInt).length() == 1) {
320 ret.append("0");
321 }
322 ret.append(Integer.toHexString(tempInt));
323 }
324 return ret.append("\"").toString();
325 }
326 else {
327 return "";
328 }
329 }
330
331 }