View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2008 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-shell Project
18  //
19  // WIGDatabaseGenerator.java
20  // Since: Nov 20, 2009
21  //
22  // $URL: http://svn.utgenome.org/utgb/trunk/utgb/utgb-shell/src/main/java/org/utgenome/shell/db/wig/WIGDatabaseGenerator.java $ 
23  // $Author: yoshimura $
24  //--------------------------------------
25  package org.utgenome.format.wig;
26  
27  import java.io.BufferedReader;
28  import java.io.ByteArrayOutputStream;
29  import java.io.IOException;
30  import java.io.ObjectOutputStream;
31  import java.io.Reader;
32  import java.io.StringReader;
33  import java.sql.Connection;
34  import java.sql.DriverManager;
35  import java.sql.PreparedStatement;
36  import java.sql.SQLException;
37  import java.sql.Statement;
38  import java.util.ArrayList;
39  import java.util.Arrays;
40  import java.util.zip.DataFormatException;
41  import java.util.zip.GZIPOutputStream;
42  
43  import org.antlr.runtime.ANTLRReaderStream;
44  import org.antlr.runtime.CommonTokenStream;
45  import org.antlr.runtime.RecognitionException;
46  import org.antlr.runtime.tree.Tree;
47  import org.xerial.core.XerialException;
48  import org.xerial.db.DBException;
49  import org.xerial.lens.Lens;
50  import org.xerial.util.StopWatch;
51  import org.xerial.util.log.Logger;
52  
53  /**
54   * Generates an SQLite database from WIG data
55   * 
56   * @author yoshimur
57   * 
58   */
59  public class WIGDatabaseGenerator {
60  
61  	private static Logger _logger = Logger.getLogger(WIGDatabaseGenerator.class);
62  	private static StopWatch stopWatch = new StopWatch();
63  
64  	private static CompressedBuffer chromStartBuffer;
65  	private static CompressedBuffer dataValueBuffer;
66  
67  	private static int data_start = 0;
68  	private static int data_step = 0;
69  
70  	private static boolean isVariableStep = true;
71  	private static boolean isAddTrackId = true;
72  	private static boolean isBufferEmpty = true;
73  
74  	private static int buffer_count = 0;
75  	private static long buffer_start = -1;
76  	private static long buffer_end = -1;
77  	private static float buffer_maxValue = Float.MIN_VALUE;
78  	private static float buffer_minValue = Float.MAX_VALUE;
79  
80  	public static final int DATA_SPLIT_UNIT = 100000;
81  	private static int[] chromStarts;
82  	private static float[] dataValues;
83  
84  	public static void toSQLiteDB(Reader wigInput, String dbName) throws IOException, XerialException {
85  		BufferedReader reader = new BufferedReader(wigInput);
86  
87  		int track_id = -1;
88  
89  		chromStartBuffer = new CompressedBuffer();
90  		dataValueBuffer = new CompressedBuffer();
91  
92  		int nPoints = 0;
93  		chromStarts = new int[DATA_SPLIT_UNIT];
94  		dataValues = new float[DATA_SPLIT_UNIT];
95  
96  		String line = null;
97  		int lineNum = 1;
98  
99  		try {
100 			Class.forName("org.sqlite.JDBC");
101 			Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbName);
102 			Statement stat = conn.createStatement();
103 
104 			conn.setAutoCommit(true);
105 			stat.executeUpdate("pragma synchronous=off");
106 			conn.setAutoCommit(false); // begin a single transaction
107 
108 			// prepare the database tables
109 			stat.executeUpdate("drop table if exists browser");
110 			stat.executeUpdate("drop table if exists track");
111 			stat.executeUpdate("drop table if exists data");
112 
113 			stat.executeUpdate("create table browser (description text)");
114 			stat.executeUpdate("create table track (track_id integer, name text, value text)");
115 			stat.executeUpdate("create table data (track_id integer, start integer, end integer, min_value real, "
116 					+ "max_value real, median real, avg real, data_num integer, chrom_starts blob, " + "data_values blob)");
117 
118 			PreparedStatement browserInfoInsertQuery = conn.prepareStatement("insert into browser values(?)");
119 			PreparedStatement trackInsertQuery = conn.prepareStatement("insert into track values(?, ?, ?)");
120 			PreparedStatement dataBlockInsertQuery = conn.prepareStatement("insert into data values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
121 
122 			stopWatch.reset();
123 
124 			// for each WIG File 
125 			while ((line = reader.readLine()) != null) {
126 				if (line.startsWith("#") || line.trim().length() == 0) {
127 				}
128 				else if (line.startsWith("browser")) {
129 					// flush buffer
130 					if (!isBufferEmpty) {
131 						insertData(track_id, dataBlockInsertQuery);
132 						nPoints = 0;
133 					}
134 
135 					// insert browser line
136 					readBrowserLine(browserInfoInsertQuery, line);
137 				}
138 				else if (line.startsWith("track") || line.startsWith("variableStep") || line.startsWith("fixedStep")) {
139 					// flush buffer
140 					if (!isBufferEmpty) {
141 						insertData(track_id, dataBlockInsertQuery);
142 						nPoints = 0;
143 					}
144 
145 					if (isAddTrackId) {
146 						track_id++;
147 						isAddTrackId = false;
148 					}
149 
150 					// insert track line
151 					readHeaderLine(track_id, trackInsertQuery, line);
152 				}
153 				else {
154 					// insert data lines					
155 					isBufferEmpty = false;
156 
157 					if (isVariableStep) {
158 						String[] lineValues = readDataLine(line, lineNum);
159 						int currentPoint = Integer.parseInt(lineValues[0]);
160 						if (buffer_count == 0) {
161 							buffer_start = currentPoint;
162 						}
163 						else {
164 							buffer_end = currentPoint;
165 						}
166 						chromStarts[buffer_count] = currentPoint;
167 						dataValues[buffer_count] = Float.parseFloat(lineValues[1]);
168 					}
169 					else {
170 						String[] lineValues = readDataLine(line, lineNum);
171 						int currentPoint = data_start + (nPoints * data_step);
172 						if (buffer_count == 0) {
173 							buffer_start = currentPoint;
174 						}
175 						else {
176 							buffer_end = currentPoint;
177 						}
178 						dataValues[buffer_count] = Float.parseFloat(lineValues[0]);
179 					}
180 
181 					buffer_maxValue = Math.max(dataValues[buffer_count], buffer_maxValue);
182 					buffer_minValue = Math.min(dataValues[buffer_count], buffer_minValue);
183 
184 					nPoints++;
185 					buffer_count++;
186 
187 					if (buffer_count >= DATA_SPLIT_UNIT) {
188 						insertData(track_id, dataBlockInsertQuery);
189 					}
190 				}
191 				lineNum++;
192 			}
193 
194 			if (!isBufferEmpty) {
195 				insertData(track_id, dataBlockInsertQuery);
196 			}
197 
198 			stat.executeUpdate("create index track_index on track (name, value)");
199 			stat.executeUpdate("create index data_index on data (track_id, start)");
200 
201 			conn.commit();
202 
203 			browserInfoInsertQuery.close();
204 			trackInsertQuery.close();
205 			dataBlockInsertQuery.close();
206 			stat.close();
207 			conn.close();
208 
209 		}
210 		catch (Exception e) {
211 			_logger.error(String.format("line %d: %s", lineNum, e));
212 		}
213 	}
214 
215 	private static void insertData(int track_id, PreparedStatement p3) throws SQLException, IOException {
216 
217 		int[] tempChromStarts = new int[buffer_count];
218 		float[] tempDataValues = new float[buffer_count];
219 
220 		System.arraycopy(chromStarts, 0, tempChromStarts, 0, buffer_count);
221 		System.arraycopy(dataValues, 0, tempDataValues, 0, buffer_count);
222 		ByteArrayOutputStream buf = new ByteArrayOutputStream();
223 		ObjectOutputStream out = new ObjectOutputStream(buf);
224 		if (isVariableStep) {
225 			out.writeObject(tempChromStarts);
226 			out.flush();
227 			chromStartBuffer.write(buf.toByteArray());
228 		}
229 		buf = new ByteArrayOutputStream();
230 		out = new ObjectOutputStream(buf);
231 		out.writeObject(tempDataValues);
232 		out.flush();
233 		dataValueBuffer.write(buf.toByteArray());
234 
235 		// compute median
236 		Arrays.sort(tempDataValues);
237 		float median = tempDataValues[buffer_count / 2];
238 		float sum = 0;
239 		for (float each : tempDataValues)
240 			sum += each;
241 		float avg = sum / buffer_count;
242 
243 		float min = tempDataValues[0];
244 		float max = tempDataValues[tempDataValues.length - 1];
245 
246 		// insert data line
247 		p3.setInt(1, track_id);
248 		p3.setLong(2, buffer_start);
249 		p3.setLong(3, buffer_end);
250 		p3.setFloat(4, min);
251 		p3.setFloat(5, max);
252 		p3.setFloat(6, median);
253 		p3.setFloat(7, avg);
254 		p3.setLong(8, buffer_count);
255 		p3.setBytes(9, chromStartBuffer.toByteArray());
256 		p3.setBytes(10, dataValueBuffer.toByteArray());
257 		p3.execute();
258 
259 		_logger.info(String.format("insert data %d:%d-%d (min:%.2f, max:%.2f, median:%.2f, avg:%.2f)", track_id, buffer_start, buffer_end, min, max, median,
260 				avg));
261 
262 		// init variables
263 		chromStarts = new int[DATA_SPLIT_UNIT];
264 		dataValues = new float[DATA_SPLIT_UNIT];
265 		isAddTrackId = true;
266 		isBufferEmpty = true;
267 		chromStartBuffer.reset();
268 		dataValueBuffer.reset();
269 		buffer_count = 0;
270 		buffer_start = -1;
271 		buffer_end = -1;
272 		buffer_maxValue = Float.MIN_VALUE;
273 		buffer_minValue = Float.MAX_VALUE;
274 	}
275 
276 	private static String[] readDataLine(String line, int lineNum) throws DataFormatException {
277 		String[] temp = line.replace(" ", "\t").trim().split("\t+");
278 		// split by tab or space
279 		if (temp.length > 2) {
280 			throw new DataFormatException("Number of line parameters > 2");
281 		}
282 		return temp;
283 	}
284 
285 	private static void readBrowserLine(PreparedStatement p1, String line) throws SQLException {
286 		p1.setString(1, line);
287 		p1.execute();
288 	}
289 
290 	private static void readHeaderLine(int track_id, PreparedStatement p2, String line) throws IOException, XerialException, RecognitionException,
291 			NumberFormatException, DBException, SQLException {
292 
293 		WIGLexer lexer = new WIGLexer(new ANTLRReaderStream(new StringReader(line)));
294 		CommonTokenStream tokens = new CommonTokenStream(lexer);
295 
296 		WIGParser parser = new WIGParser(tokens);
297 		WIGParser.description_return ret = parser.description();
298 
299 		if (line.startsWith("variableStep")) {
300 			isVariableStep = true;
301 			p2.setInt(1, track_id);
302 			p2.setString(2, "stepType");
303 			p2.setString(3, "variableStep");
304 			p2.execute();
305 		}
306 		else if (line.startsWith("fixedStep")) {
307 			isVariableStep = false;
308 			p2.setInt(1, track_id);
309 			p2.setString(2, "stepType");
310 			p2.setString(3, "fixedStep");
311 			p2.execute();
312 		}
313 
314 		for (WIGHeaderAttribute a : Lens.loadANTLRParseTree(WIGHeaderDescription.class, (Tree) ret.getTree(), WIGParser.tokenNames).attributes) {
315 			if (a.name.equals("start")) {
316 				data_start = Integer.parseInt(a.value);
317 			}
318 			else if (a.name.equals("step")) {
319 				data_step = Integer.parseInt(a.value);
320 			}
321 
322 			p2.setInt(1, track_id);
323 			p2.setString(2, a.name);
324 			p2.setString(3, a.value);
325 			p2.execute();
326 		}
327 	}
328 
329 	public static class WIGHeaderDescription {
330 		String name;
331 		ArrayList<WIGHeaderAttribute> attributes = new ArrayList<WIGHeaderAttribute>();
332 
333 		public void setName(String name) {
334 			this.name = name;
335 		}
336 
337 		public void addAttribute(WIGHeaderAttribute attribute) {
338 			attributes.add(attribute);
339 		}
340 
341 		@Override
342 		public String toString() {
343 			return String.format("name=%s, attributes=%s", name, attributes.toString());
344 		}
345 	}
346 
347 	public static class WIGHeaderAttribute {
348 		String name;
349 		String value;
350 
351 		public void setName(String name) {
352 			this.name = name;
353 		}
354 
355 		public void setValue(String value) {
356 			this.value = value;
357 		}
358 
359 		@Override
360 		public String toString() {
361 			return String.format("{name=%s, value=%s}", name, value);
362 		}
363 	}
364 
365 	static class CompressedBuffer {
366 		private ByteArrayOutputStream buf;
367 		private GZIPOutputStream compressor;
368 		private int writtenSize;
369 
370 		public CompressedBuffer() throws IOException {
371 			reset();
372 		}
373 
374 		public int writtenSize() {
375 			return writtenSize;
376 		}
377 
378 		public void write(byte[] data) throws IOException {
379 			compressor.write(data);
380 			writtenSize += data.length;
381 		}
382 
383 		public byte[] toByteArray() throws IOException {
384 			compressor.finish();
385 			byte[] ret = buf.toByteArray();
386 			return ret;
387 		}
388 
389 		public void reset() throws IOException {
390 			buf = new ByteArrayOutputStream();
391 			compressor = new GZIPOutputStream(buf);
392 			writtenSize = 0;
393 		}
394 	}
395 }