1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.utgenome.format.wig;
26
27 import java.io.BufferedReader;
28 import java.io.ByteArrayOutputStream;
29 import java.io.IOException;
30 import java.io.ObjectOutputStream;
31 import java.io.Reader;
32 import java.io.StringReader;
33 import java.sql.Connection;
34 import java.sql.DriverManager;
35 import java.sql.PreparedStatement;
36 import java.sql.SQLException;
37 import java.sql.Statement;
38 import java.util.ArrayList;
39 import java.util.Arrays;
40 import java.util.zip.DataFormatException;
41 import java.util.zip.GZIPOutputStream;
42
43 import org.antlr.runtime.ANTLRReaderStream;
44 import org.antlr.runtime.CommonTokenStream;
45 import org.antlr.runtime.RecognitionException;
46 import org.antlr.runtime.tree.Tree;
47 import org.xerial.core.XerialException;
48 import org.xerial.db.DBException;
49 import org.xerial.lens.Lens;
50 import org.xerial.util.StopWatch;
51 import org.xerial.util.log.Logger;
52
53
54
55
56
57
58
59 public class WIGDatabaseGenerator {
60
61 private static Logger _logger = Logger.getLogger(WIGDatabaseGenerator.class);
62 private static StopWatch stopWatch = new StopWatch();
63
64 private static CompressedBuffer chromStartBuffer;
65 private static CompressedBuffer dataValueBuffer;
66
67 private static int data_start = 0;
68 private static int data_step = 0;
69
70 private static boolean isVariableStep = true;
71 private static boolean isAddTrackId = true;
72 private static boolean isBufferEmpty = true;
73
74 private static int buffer_count = 0;
75 private static long buffer_start = -1;
76 private static long buffer_end = -1;
77 private static float buffer_maxValue = Float.MIN_VALUE;
78 private static float buffer_minValue = Float.MAX_VALUE;
79
80 public static final int DATA_SPLIT_UNIT = 100000;
81 private static int[] chromStarts;
82 private static float[] dataValues;
83
84 public static void toSQLiteDB(Reader wigInput, String dbName) throws IOException, XerialException {
85 BufferedReader reader = new BufferedReader(wigInput);
86
87 int track_id = -1;
88
89 chromStartBuffer = new CompressedBuffer();
90 dataValueBuffer = new CompressedBuffer();
91
92 int nPoints = 0;
93 chromStarts = new int[DATA_SPLIT_UNIT];
94 dataValues = new float[DATA_SPLIT_UNIT];
95
96 String line = null;
97 int lineNum = 1;
98
99 try {
100 Class.forName("org.sqlite.JDBC");
101 Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbName);
102 Statement stat = conn.createStatement();
103
104 conn.setAutoCommit(true);
105 stat.executeUpdate("pragma synchronous=off");
106 conn.setAutoCommit(false);
107
108
109 stat.executeUpdate("drop table if exists browser");
110 stat.executeUpdate("drop table if exists track");
111 stat.executeUpdate("drop table if exists data");
112
113 stat.executeUpdate("create table browser (description text)");
114 stat.executeUpdate("create table track (track_id integer, name text, value text)");
115 stat.executeUpdate("create table data (track_id integer, start integer, end integer, min_value real, "
116 + "max_value real, median real, avg real, data_num integer, chrom_starts blob, " + "data_values blob)");
117
118 PreparedStatement browserInfoInsertQuery = conn.prepareStatement("insert into browser values(?)");
119 PreparedStatement trackInsertQuery = conn.prepareStatement("insert into track values(?, ?, ?)");
120 PreparedStatement dataBlockInsertQuery = conn.prepareStatement("insert into data values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
121
122 stopWatch.reset();
123
124
125 while ((line = reader.readLine()) != null) {
126 if (line.startsWith("#") || line.trim().length() == 0) {
127 }
128 else if (line.startsWith("browser")) {
129
130 if (!isBufferEmpty) {
131 insertData(track_id, dataBlockInsertQuery);
132 nPoints = 0;
133 }
134
135
136 readBrowserLine(browserInfoInsertQuery, line);
137 }
138 else if (line.startsWith("track") || line.startsWith("variableStep") || line.startsWith("fixedStep")) {
139
140 if (!isBufferEmpty) {
141 insertData(track_id, dataBlockInsertQuery);
142 nPoints = 0;
143 }
144
145 if (isAddTrackId) {
146 track_id++;
147 isAddTrackId = false;
148 }
149
150
151 readHeaderLine(track_id, trackInsertQuery, line);
152 }
153 else {
154
155 isBufferEmpty = false;
156
157 if (isVariableStep) {
158 String[] lineValues = readDataLine(line, lineNum);
159 int currentPoint = Integer.parseInt(lineValues[0]);
160 if (buffer_count == 0) {
161 buffer_start = currentPoint;
162 }
163 else {
164 buffer_end = currentPoint;
165 }
166 chromStarts[buffer_count] = currentPoint;
167 dataValues[buffer_count] = Float.parseFloat(lineValues[1]);
168 }
169 else {
170 String[] lineValues = readDataLine(line, lineNum);
171 int currentPoint = data_start + (nPoints * data_step);
172 if (buffer_count == 0) {
173 buffer_start = currentPoint;
174 }
175 else {
176 buffer_end = currentPoint;
177 }
178 dataValues[buffer_count] = Float.parseFloat(lineValues[0]);
179 }
180
181 buffer_maxValue = Math.max(dataValues[buffer_count], buffer_maxValue);
182 buffer_minValue = Math.min(dataValues[buffer_count], buffer_minValue);
183
184 nPoints++;
185 buffer_count++;
186
187 if (buffer_count >= DATA_SPLIT_UNIT) {
188 insertData(track_id, dataBlockInsertQuery);
189 }
190 }
191 lineNum++;
192 }
193
194 if (!isBufferEmpty) {
195 insertData(track_id, dataBlockInsertQuery);
196 }
197
198 stat.executeUpdate("create index track_index on track (name, value)");
199 stat.executeUpdate("create index data_index on data (track_id, start)");
200
201 conn.commit();
202
203 browserInfoInsertQuery.close();
204 trackInsertQuery.close();
205 dataBlockInsertQuery.close();
206 stat.close();
207 conn.close();
208
209 }
210 catch (Exception e) {
211 _logger.error(String.format("line %d: %s", lineNum, e));
212 }
213 }
214
215 private static void insertData(int track_id, PreparedStatement p3) throws SQLException, IOException {
216
217 int[] tempChromStarts = new int[buffer_count];
218 float[] tempDataValues = new float[buffer_count];
219
220 System.arraycopy(chromStarts, 0, tempChromStarts, 0, buffer_count);
221 System.arraycopy(dataValues, 0, tempDataValues, 0, buffer_count);
222 ByteArrayOutputStream buf = new ByteArrayOutputStream();
223 ObjectOutputStream out = new ObjectOutputStream(buf);
224 if (isVariableStep) {
225 out.writeObject(tempChromStarts);
226 out.flush();
227 chromStartBuffer.write(buf.toByteArray());
228 }
229 buf = new ByteArrayOutputStream();
230 out = new ObjectOutputStream(buf);
231 out.writeObject(tempDataValues);
232 out.flush();
233 dataValueBuffer.write(buf.toByteArray());
234
235
236 Arrays.sort(tempDataValues);
237 float median = tempDataValues[buffer_count / 2];
238 float sum = 0;
239 for (float each : tempDataValues)
240 sum += each;
241 float avg = sum / buffer_count;
242
243 float min = tempDataValues[0];
244 float max = tempDataValues[tempDataValues.length - 1];
245
246
247 p3.setInt(1, track_id);
248 p3.setLong(2, buffer_start);
249 p3.setLong(3, buffer_end);
250 p3.setFloat(4, min);
251 p3.setFloat(5, max);
252 p3.setFloat(6, median);
253 p3.setFloat(7, avg);
254 p3.setLong(8, buffer_count);
255 p3.setBytes(9, chromStartBuffer.toByteArray());
256 p3.setBytes(10, dataValueBuffer.toByteArray());
257 p3.execute();
258
259 _logger.info(String.format("insert data %d:%d-%d (min:%.2f, max:%.2f, median:%.2f, avg:%.2f)", track_id, buffer_start, buffer_end, min, max, median,
260 avg));
261
262
263 chromStarts = new int[DATA_SPLIT_UNIT];
264 dataValues = new float[DATA_SPLIT_UNIT];
265 isAddTrackId = true;
266 isBufferEmpty = true;
267 chromStartBuffer.reset();
268 dataValueBuffer.reset();
269 buffer_count = 0;
270 buffer_start = -1;
271 buffer_end = -1;
272 buffer_maxValue = Float.MIN_VALUE;
273 buffer_minValue = Float.MAX_VALUE;
274 }
275
276 private static String[] readDataLine(String line, int lineNum) throws DataFormatException {
277 String[] temp = line.replace(" ", "\t").trim().split("\t+");
278
279 if (temp.length > 2) {
280 throw new DataFormatException("Number of line parameters > 2");
281 }
282 return temp;
283 }
284
285 private static void readBrowserLine(PreparedStatement p1, String line) throws SQLException {
286 p1.setString(1, line);
287 p1.execute();
288 }
289
290 private static void readHeaderLine(int track_id, PreparedStatement p2, String line) throws IOException, XerialException, RecognitionException,
291 NumberFormatException, DBException, SQLException {
292
293 WIGLexer lexer = new WIGLexer(new ANTLRReaderStream(new StringReader(line)));
294 CommonTokenStream tokens = new CommonTokenStream(lexer);
295
296 WIGParser parser = new WIGParser(tokens);
297 WIGParser.description_return ret = parser.description();
298
299 if (line.startsWith("variableStep")) {
300 isVariableStep = true;
301 p2.setInt(1, track_id);
302 p2.setString(2, "stepType");
303 p2.setString(3, "variableStep");
304 p2.execute();
305 }
306 else if (line.startsWith("fixedStep")) {
307 isVariableStep = false;
308 p2.setInt(1, track_id);
309 p2.setString(2, "stepType");
310 p2.setString(3, "fixedStep");
311 p2.execute();
312 }
313
314 for (WIGHeaderAttribute a : Lens.loadANTLRParseTree(WIGHeaderDescription.class, (Tree) ret.getTree(), WIGParser.tokenNames).attributes) {
315 if (a.name.equals("start")) {
316 data_start = Integer.parseInt(a.value);
317 }
318 else if (a.name.equals("step")) {
319 data_step = Integer.parseInt(a.value);
320 }
321
322 p2.setInt(1, track_id);
323 p2.setString(2, a.name);
324 p2.setString(3, a.value);
325 p2.execute();
326 }
327 }
328
329 public static class WIGHeaderDescription {
330 String name;
331 ArrayList<WIGHeaderAttribute> attributes = new ArrayList<WIGHeaderAttribute>();
332
333 public void setName(String name) {
334 this.name = name;
335 }
336
337 public void addAttribute(WIGHeaderAttribute attribute) {
338 attributes.add(attribute);
339 }
340
341 @Override
342 public String toString() {
343 return String.format("name=%s, attributes=%s", name, attributes.toString());
344 }
345 }
346
347 public static class WIGHeaderAttribute {
348 String name;
349 String value;
350
351 public void setName(String name) {
352 this.name = name;
353 }
354
355 public void setValue(String value) {
356 this.value = value;
357 }
358
359 @Override
360 public String toString() {
361 return String.format("{name=%s, value=%s}", name, value);
362 }
363 }
364
365 static class CompressedBuffer {
366 private ByteArrayOutputStream buf;
367 private GZIPOutputStream compressor;
368 private int writtenSize;
369
370 public CompressedBuffer() throws IOException {
371 reset();
372 }
373
374 public int writtenSize() {
375 return writtenSize;
376 }
377
378 public void write(byte[] data) throws IOException {
379 compressor.write(data);
380 writtenSize += data.length;
381 }
382
383 public byte[] toByteArray() throws IOException {
384 compressor.finish();
385 byte[] ret = buf.toByteArray();
386 return ret;
387 }
388
389 public void reset() throws IOException {
390 buf = new ByteArrayOutputStream();
391 compressor = new GZIPOutputStream(buf);
392 writtenSize = 0;
393 }
394 }
395 }