View Javadoc

1   //--------------------------------------
2   // base-color Project
3   //
4   // FastaToDBGenerator.java
5   // Since: Jan 25, 2008
6   //
7   // $URL: http://svn.utgenome.org/utgb/trunk/utgb/utgb-tracks/base-color/src/main/java/org/utgenome/track/basecolor/FastaToDBGenerator.java $ 
8   // $Author: leo $
9   //--------------------------------------
10  package org.utgenome.format.fasta;
11  
12  import java.io.BufferedReader;
13  import java.io.ByteArrayInputStream;
14  import java.io.ByteArrayOutputStream;
15  import java.io.File;
16  import java.io.FileReader;
17  import java.io.IOException;
18  import java.io.InputStreamReader;
19  import java.io.Reader;
20  import java.sql.PreparedStatement;
21  import java.sql.SQLException;
22  import java.util.zip.GZIPInputStream;
23  import java.util.zip.GZIPOutputStream;
24  
25  import org.utgenome.UTGBErrorCode;
26  import org.utgenome.UTGBException;
27  import org.utgenome.gwt.utgb.client.bio.ChrLoc;
28  import org.xerial.db.DBException;
29  import org.xerial.db.sql.BeanResultHandler;
30  import org.xerial.db.sql.PreparedStatementHandler;
31  import org.xerial.db.sql.SQLExpression;
32  import org.xerial.db.sql.SQLUtil;
33  import org.xerial.db.sql.sqlite.SQLiteAccess;
34  import org.xerial.util.StopWatch;
35  import org.xerial.util.log.Logger;
36  import org.xerial.util.opt.Argument;
37  import org.xerial.util.opt.Option;
38  import org.xerial.util.opt.OptionParser;
39  
40  /**
41   * FASTADatabase creates a SQLite database file from a given FASTA format file and retrieves stored sequence fragments from it
42   * 
43   * @author leo
44   * 
45   */
46  public class FASTADatabase {
47  	private static Logger _logger = Logger.getLogger(FASTADatabase.class);
48  
49  	public static class Config {
50  		@Option(symbol = "h", longName = "help", description = "display help message")
51  		public boolean dispalyHelp = false;
52  		@Option(symbol = "o", longName = "out", varName = "DBNAME", description = "output SQLite database file name")
53  		public String outputDBName = null;
54  
55  		@Argument(index = 0, required = false, name = "input FASTA file")
56  		public String inputFASTAFile = null;
57  	}
58  
59  	static class CompressedBuffer {
60  		private ByteArrayOutputStream buf;
61  		private GZIPOutputStream compressor;
62  		private int writtenSize;
63  
64  		public CompressedBuffer() throws IOException {
65  			reset();
66  		}
67  
68  		public int writtenSize() {
69  			return writtenSize;
70  		}
71  
72  		public void write(byte[] data) throws IOException {
73  			compressor.write(data);
74  			writtenSize += data.length;
75  		}
76  
77  		public byte[] toByteArray() throws IOException {
78  			compressor.finish();
79  			byte[] ret = buf.toByteArray();
80  			return ret;
81  		}
82  
83  		public void reset() throws IOException {
84  			buf = new ByteArrayOutputStream();
85  			compressor = new GZIPOutputStream(buf);
86  			writtenSize = 0;
87  		}
88  
89  	}
90  
91  	private static final int SEQUENCE_FRAGMENT_LENGTH = 100000;
92  
93  	public void createDB(Reader fasta, SQLiteAccess db) throws Exception {
94  		createDB(new FASTAPullParser(fasta), db);
95  	}
96  
97  	public void createDB(FASTAPullParser pullParser, SQLiteAccess db) throws Exception {
98  		try {
99  			db.setAutoCommit(true);
100 			db.update("pragma synchronous=off");
101 			db.setAutoCommit(false); // begin a single transaction
102 
103 			// prepare the database tables
104 			db.update("drop table if exists description");
105 			db.update("drop table if exists sequence");
106 			db.update("drop table if exists sequence_length");
107 			db.update("create table description (id integer primary key not null, description string, fullDesc string)");
108 			db.update("create index description_index on description(description)");
109 			db.update("create table sequence (description_id integer not null, start integer, end integer, sequence string, primary key (description_id, start))");
110 			db.update("create table sequence_length (description_id integer primary_key not null, length integer)");
111 
112 			stopWatch.reset();
113 			// load the FASTA file
114 			int count = 1;
115 			String description = null;
116 
117 			// for each FASTA entry 
118 			while ((description = pullParser.nextDescriptionLine()) != null) {
119 				long start = 1;
120 				long end = 1;
121 
122 				String chr = CompactFASTA.pickSequenceName(description);
123 				_logger.info("new entry: " + chr);
124 
125 				db.update(SQLExpression.fillTemplate("insert into description values($1, $2, $3)", count, SQLUtil.singleQuote(chr),
126 						SQLUtil.singleQuote(description)));
127 
128 				CompressedBuffer buffer = new CompressedBuffer();
129 
130 				// for each sequence line
131 				String seq = null;
132 				while ((seq = pullParser.nextSequenceLine()) != null) {
133 					final int seqLen = seq.length();
134 					for (int cursor = 0; cursor < seqLen;) {
135 						final int storedLen = buffer.writtenSize();
136 						final int fragmentSize = storedLen + seqLen - cursor;
137 						if (fragmentSize > SEQUENCE_FRAGMENT_LENGTH) {
138 							int seqEnd = cursor + SEQUENCE_FRAGMENT_LENGTH - storedLen;
139 							buffer.write(seq.substring(cursor, seqEnd).getBytes());
140 							end = start + buffer.writtenSize() - 1;
141 							insertSequence(db, count, start, end, buffer.toByteArray());
142 							start = end + 1;
143 							buffer.reset();
144 							cursor = seqEnd;
145 						}
146 						else {
147 							buffer.write(cursor == 0 ? seq.getBytes() : seq.substring(cursor).getBytes());
148 							cursor = seq.length();
149 						}
150 					}
151 					//					
152 					//					
153 					//					if (storedLen + seq.length() >= SEQUENCE_FRAGMENT_LENGTH) {
154 					//						
155 					//						
156 					//						int remainingSize = SEQUENCE_FRAGMENT_LENGTH - storedLen;
157 					//						buffer.write(seq.substring(0, remainingSize).getBytes());
158 					//
159 					//						end = start + buffer.writtenSize() - 1;
160 					//						insertSequence(db, count, start, end, buffer.toByteArray());
161 					//						start = end + 1;
162 					//
163 					//						buffer.reset();
164 					//						buffer.write(seq.substring(remainingSize).getBytes());
165 					//					}
166 					//					else {
167 					//						buffer.write(seq.getBytes());
168 					//					}
169 				}
170 				if (buffer.writtenSize() > 0) {
171 					end = start + buffer.writtenSize() - 1;
172 					insertSequence(db, count, start, end, buffer.toByteArray());
173 					start = end + 1;
174 				}
175 
176 				// set sequence_length
177 				db.update(SQLExpression.fillTemplate("insert into sequence_length values($1, $2)", count, end));
178 
179 				count++;
180 			}
181 		}
182 		catch (DBException e) {
183 			_logger.error(e);
184 		}
185 		finally {
186 			db.update("commit");
187 
188 		}
189 
190 		_logger.info("done.");
191 
192 	}
193 
194 	public static void main(String[] args) throws Exception {
195 
196 		Config conf = new Config();
197 		OptionParser optionParser = new OptionParser(conf);
198 
199 		optionParser.parse(args);
200 
201 		if (conf.dispalyHelp) {
202 			optionParser.printUsage();
203 			return;
204 		}
205 
206 		Reader input = null;
207 		String fastaName = null;
208 		if (conf.inputFASTAFile != null) {
209 			File fastaFile = new File(conf.inputFASTAFile);
210 			_logger.info("fasta file: " + fastaFile);
211 			if (!fastaFile.exists())
212 				throw new Exception(fastaFile.getName() + " does not exist");
213 			input = new BufferedReader(new FileReader(fastaFile));
214 			fastaName = fastaFile.getName();
215 		}
216 		else {
217 			input = new InputStreamReader(System.in);
218 			fastaName = "out";
219 		}
220 
221 		assert (fastaName != null);
222 
223 		String dbName = conf.outputDBName != null ? conf.outputDBName : fastaName + ".db";
224 		_logger.info("output sqlite db file: " + dbName);
225 
226 		FASTADatabase p = new FASTADatabase();
227 		SQLiteAccess db = new SQLiteAccess(dbName);
228 		p.createDB(input, db);
229 		db.dispose();
230 	}
231 
232 	private static int insertCount = 0;
233 	private static StopWatch stopWatch = new StopWatch();
234 
235 	private static void insertSequence(SQLiteAccess db, int descriptionID, long start, long end, byte[] sequence) throws DBException {
236 
237 		db.updateWithPreparedStatement(SQLExpression.fillTemplate("insert into sequence values($1, $2, $3, ?)", descriptionID, start, end), new SequenceSetter(
238 				sequence));
239 
240 		insertCount++;
241 		if ((insertCount % 10000) == 0) {
242 			_logger.info("inserted " + insertCount + "\t" + stopWatch.getElapsedTime() + " sec.");
243 		}
244 	}
245 
246 	/**
247 	 * For setting a byte array to the given prepared statement
248 	 * 
249 	 * @author leo
250 	 * 
251 	 */
252 	static class SequenceSetter implements PreparedStatementHandler {
253 		private final byte[] sequence;
254 
255 		public SequenceSetter(byte[] sequence) {
256 			this.sequence = sequence;
257 		}
258 
259 		public void setup(PreparedStatement preparedStatement) throws SQLException {
260 			preparedStatement.setBytes(1, sequence);
261 
262 		}
263 
264 	}
265 
266 	public static void querySequence(File dbFile, ChrLoc location, BeanResultHandler<NSeq> handler) throws UTGBException {
267 		if (!dbFile.exists())
268 			throw new UTGBException(UTGBErrorCode.MISSING_FILES, "DB file doesn't exist: " + dbFile);
269 
270 		SQLiteAccess db = null;
271 		try {
272 			try {
273 				db = new SQLiteAccess(dbFile.getAbsolutePath());
274 				querySequence(db, location, handler);
275 			}
276 			finally {
277 				if (db != null)
278 					db.dispose();
279 			}
280 		}
281 		catch (Exception e) {
282 			throw UTGBException.convert(e);
283 		}
284 
285 	}
286 
287 	public static void querySequence(SQLiteAccess db, ChrLoc location, BeanResultHandler<NSeq> handler) throws UTGBException {
288 
289 		try {
290 			int start = location.viewStart();
291 			int end = location.viewEnd();
292 			int searchStart = (start / SEQUENCE_FRAGMENT_LENGTH) * SEQUENCE_FRAGMENT_LENGTH + 1;
293 
294 			String sql = SQLExpression.fillTemplate("select start, end, sequence from " + "(select * from description where description= '$1') as description "
295 					+ "join sequence on sequence.description_id = description.id " + "where start between $2 and $3 " + "and end > $4 order by start",
296 					location.chr, searchStart, end, start);
297 
298 			db.query(sql, NSeq.class, handler);
299 		}
300 		catch (Exception e) {
301 			throw UTGBException.convert(e);
302 		}
303 
304 	}
305 
306 	/**
307 	 * A holder for retrieving compressed genome sequence
308 	 * 
309 	 * @author leo
310 	 * 
311 	 */
312 	public static class NSeq {
313 		private int start;
314 		private int end;
315 		private byte[] sequence;
316 
317 		public NSeq() {
318 		}
319 
320 		public NSeq(int start, int end, byte[] sequence) {
321 			this.start = start;
322 			this.end = end;
323 			this.sequence = sequence;
324 		}
325 
326 		@Override
327 		public String toString() {
328 			StringBuilder buf = new StringBuilder();
329 			buf.append("(");
330 			buf.append(start);
331 			buf.append(",");
332 			buf.append(end);
333 			buf.append(")");
334 			return buf.toString();
335 		}
336 
337 		public void setStart(int start) {
338 			this.start = start;
339 		}
340 
341 		public void setEnd(int end) {
342 			this.end = end;
343 		}
344 
345 		public int getStart() {
346 			return start;
347 		}
348 
349 		public int getEnd() {
350 			return end;
351 		}
352 
353 		/**
354 		 * extract sub sequence (0-origin)
355 		 * 
356 		 * @param bufStart
357 		 * @param bufEnd
358 		 * @return
359 		 */
360 		public String getSubSequence(int bufStart, int bufEnd) {
361 			return new String(sequence, bufStart, bufEnd - bufStart);
362 		}
363 
364 		public int getLength() {
365 			return sequence.length;
366 		}
367 
368 		public byte[] getSequence() {
369 			return sequence;
370 		}
371 
372 		public void setSequence(byte[] sequence) throws IOException {
373 			GZIPInputStream decompressor = new GZIPInputStream(new ByteArrayInputStream(sequence));
374 			ByteArrayOutputStream b = new ByteArrayOutputStream();
375 			byte[] buf = new byte[8192];
376 			int readBytes = 0;
377 			while ((readBytes = decompressor.read(buf)) != -1) {
378 				b.write(buf, 0, readBytes);
379 			}
380 			this.sequence = b.toByteArray();
381 		}
382 
383 	}
384 
385 }