1
2
3
4
5
6
7
8
9
10 package org.utgenome.format.fasta;
11
12 import java.io.BufferedReader;
13 import java.io.ByteArrayInputStream;
14 import java.io.ByteArrayOutputStream;
15 import java.io.File;
16 import java.io.FileReader;
17 import java.io.IOException;
18 import java.io.InputStreamReader;
19 import java.io.Reader;
20 import java.sql.PreparedStatement;
21 import java.sql.SQLException;
22 import java.util.zip.GZIPInputStream;
23 import java.util.zip.GZIPOutputStream;
24
25 import org.utgenome.UTGBErrorCode;
26 import org.utgenome.UTGBException;
27 import org.utgenome.gwt.utgb.client.bio.ChrLoc;
28 import org.xerial.db.DBException;
29 import org.xerial.db.sql.BeanResultHandler;
30 import org.xerial.db.sql.PreparedStatementHandler;
31 import org.xerial.db.sql.SQLExpression;
32 import org.xerial.db.sql.SQLUtil;
33 import org.xerial.db.sql.sqlite.SQLiteAccess;
34 import org.xerial.util.StopWatch;
35 import org.xerial.util.log.Logger;
36 import org.xerial.util.opt.Argument;
37 import org.xerial.util.opt.Option;
38 import org.xerial.util.opt.OptionParser;
39
40
41
42
43
44
45
46 public class FASTADatabase {
47 private static Logger _logger = Logger.getLogger(FASTADatabase.class);
48
49 public static class Config {
50 @Option(symbol = "h", longName = "help", description = "display help message")
51 public boolean dispalyHelp = false;
52 @Option(symbol = "o", longName = "out", varName = "DBNAME", description = "output SQLite database file name")
53 public String outputDBName = null;
54
55 @Argument(index = 0, required = false, name = "input FASTA file")
56 public String inputFASTAFile = null;
57 }
58
59 static class CompressedBuffer {
60 private ByteArrayOutputStream buf;
61 private GZIPOutputStream compressor;
62 private int writtenSize;
63
64 public CompressedBuffer() throws IOException {
65 reset();
66 }
67
68 public int writtenSize() {
69 return writtenSize;
70 }
71
72 public void write(byte[] data) throws IOException {
73 compressor.write(data);
74 writtenSize += data.length;
75 }
76
77 public byte[] toByteArray() throws IOException {
78 compressor.finish();
79 byte[] ret = buf.toByteArray();
80 return ret;
81 }
82
83 public void reset() throws IOException {
84 buf = new ByteArrayOutputStream();
85 compressor = new GZIPOutputStream(buf);
86 writtenSize = 0;
87 }
88
89 }
90
91 private static final int SEQUENCE_FRAGMENT_LENGTH = 100000;
92
93 public void createDB(Reader fasta, SQLiteAccess db) throws Exception {
94 createDB(new FASTAPullParser(fasta), db);
95 }
96
97 public void createDB(FASTAPullParser pullParser, SQLiteAccess db) throws Exception {
98 try {
99 db.setAutoCommit(true);
100 db.update("pragma synchronous=off");
101 db.setAutoCommit(false);
102
103
104 db.update("drop table if exists description");
105 db.update("drop table if exists sequence");
106 db.update("drop table if exists sequence_length");
107 db.update("create table description (id integer primary key not null, description string, fullDesc string)");
108 db.update("create index description_index on description(description)");
109 db.update("create table sequence (description_id integer not null, start integer, end integer, sequence string, primary key (description_id, start))");
110 db.update("create table sequence_length (description_id integer primary_key not null, length integer)");
111
112 stopWatch.reset();
113
114 int count = 1;
115 String description = null;
116
117
118 while ((description = pullParser.nextDescriptionLine()) != null) {
119 long start = 1;
120 long end = 1;
121
122 String chr = CompactFASTA.pickSequenceName(description);
123 _logger.info("new entry: " + chr);
124
125 db.update(SQLExpression.fillTemplate("insert into description values($1, $2, $3)", count, SQLUtil.singleQuote(chr),
126 SQLUtil.singleQuote(description)));
127
128 CompressedBuffer buffer = new CompressedBuffer();
129
130
131 String seq = null;
132 while ((seq = pullParser.nextSequenceLine()) != null) {
133 final int seqLen = seq.length();
134 for (int cursor = 0; cursor < seqLen;) {
135 final int storedLen = buffer.writtenSize();
136 final int fragmentSize = storedLen + seqLen - cursor;
137 if (fragmentSize > SEQUENCE_FRAGMENT_LENGTH) {
138 int seqEnd = cursor + SEQUENCE_FRAGMENT_LENGTH - storedLen;
139 buffer.write(seq.substring(cursor, seqEnd).getBytes());
140 end = start + buffer.writtenSize() - 1;
141 insertSequence(db, count, start, end, buffer.toByteArray());
142 start = end + 1;
143 buffer.reset();
144 cursor = seqEnd;
145 }
146 else {
147 buffer.write(cursor == 0 ? seq.getBytes() : seq.substring(cursor).getBytes());
148 cursor = seq.length();
149 }
150 }
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169 }
170 if (buffer.writtenSize() > 0) {
171 end = start + buffer.writtenSize() - 1;
172 insertSequence(db, count, start, end, buffer.toByteArray());
173 start = end + 1;
174 }
175
176
177 db.update(SQLExpression.fillTemplate("insert into sequence_length values($1, $2)", count, end));
178
179 count++;
180 }
181 }
182 catch (DBException e) {
183 _logger.error(e);
184 }
185 finally {
186 db.update("commit");
187
188 }
189
190 _logger.info("done.");
191
192 }
193
194 public static void main(String[] args) throws Exception {
195
196 Config conf = new Config();
197 OptionParser optionParser = new OptionParser(conf);
198
199 optionParser.parse(args);
200
201 if (conf.dispalyHelp) {
202 optionParser.printUsage();
203 return;
204 }
205
206 Reader input = null;
207 String fastaName = null;
208 if (conf.inputFASTAFile != null) {
209 File fastaFile = new File(conf.inputFASTAFile);
210 _logger.info("fasta file: " + fastaFile);
211 if (!fastaFile.exists())
212 throw new Exception(fastaFile.getName() + " does not exist");
213 input = new BufferedReader(new FileReader(fastaFile));
214 fastaName = fastaFile.getName();
215 }
216 else {
217 input = new InputStreamReader(System.in);
218 fastaName = "out";
219 }
220
221 assert (fastaName != null);
222
223 String dbName = conf.outputDBName != null ? conf.outputDBName : fastaName + ".db";
224 _logger.info("output sqlite db file: " + dbName);
225
226 FASTADatabase p = new FASTADatabase();
227 SQLiteAccess db = new SQLiteAccess(dbName);
228 p.createDB(input, db);
229 db.dispose();
230 }
231
232 private static int insertCount = 0;
233 private static StopWatch stopWatch = new StopWatch();
234
235 private static void insertSequence(SQLiteAccess db, int descriptionID, long start, long end, byte[] sequence) throws DBException {
236
237 db.updateWithPreparedStatement(SQLExpression.fillTemplate("insert into sequence values($1, $2, $3, ?)", descriptionID, start, end), new SequenceSetter(
238 sequence));
239
240 insertCount++;
241 if ((insertCount % 10000) == 0) {
242 _logger.info("inserted " + insertCount + "\t" + stopWatch.getElapsedTime() + " sec.");
243 }
244 }
245
246
247
248
249
250
251
252 static class SequenceSetter implements PreparedStatementHandler {
253 private final byte[] sequence;
254
255 public SequenceSetter(byte[] sequence) {
256 this.sequence = sequence;
257 }
258
259 public void setup(PreparedStatement preparedStatement) throws SQLException {
260 preparedStatement.setBytes(1, sequence);
261
262 }
263
264 }
265
266 public static void querySequence(File dbFile, ChrLoc location, BeanResultHandler<NSeq> handler) throws UTGBException {
267 if (!dbFile.exists())
268 throw new UTGBException(UTGBErrorCode.MISSING_FILES, "DB file doesn't exist: " + dbFile);
269
270 SQLiteAccess db = null;
271 try {
272 try {
273 db = new SQLiteAccess(dbFile.getAbsolutePath());
274 querySequence(db, location, handler);
275 }
276 finally {
277 if (db != null)
278 db.dispose();
279 }
280 }
281 catch (Exception e) {
282 throw UTGBException.convert(e);
283 }
284
285 }
286
287 public static void querySequence(SQLiteAccess db, ChrLoc location, BeanResultHandler<NSeq> handler) throws UTGBException {
288
289 try {
290 int start = location.viewStart();
291 int end = location.viewEnd();
292 int searchStart = (start / SEQUENCE_FRAGMENT_LENGTH) * SEQUENCE_FRAGMENT_LENGTH + 1;
293
294 String sql = SQLExpression.fillTemplate("select start, end, sequence from " + "(select * from description where description= '$1') as description "
295 + "join sequence on sequence.description_id = description.id " + "where start between $2 and $3 " + "and end > $4 order by start",
296 location.chr, searchStart, end, start);
297
298 db.query(sql, NSeq.class, handler);
299 }
300 catch (Exception e) {
301 throw UTGBException.convert(e);
302 }
303
304 }
305
306
307
308
309
310
311
312 public static class NSeq {
313 private int start;
314 private int end;
315 private byte[] sequence;
316
317 public NSeq() {
318 }
319
320 public NSeq(int start, int end, byte[] sequence) {
321 this.start = start;
322 this.end = end;
323 this.sequence = sequence;
324 }
325
326 @Override
327 public String toString() {
328 StringBuilder buf = new StringBuilder();
329 buf.append("(");
330 buf.append(start);
331 buf.append(",");
332 buf.append(end);
333 buf.append(")");
334 return buf.toString();
335 }
336
337 public void setStart(int start) {
338 this.start = start;
339 }
340
341 public void setEnd(int end) {
342 this.end = end;
343 }
344
345 public int getStart() {
346 return start;
347 }
348
349 public int getEnd() {
350 return end;
351 }
352
353
354
355
356
357
358
359
360 public String getSubSequence(int bufStart, int bufEnd) {
361 return new String(sequence, bufStart, bufEnd - bufStart);
362 }
363
364 public int getLength() {
365 return sequence.length;
366 }
367
368 public byte[] getSequence() {
369 return sequence;
370 }
371
372 public void setSequence(byte[] sequence) throws IOException {
373 GZIPInputStream decompressor = new GZIPInputStream(new ByteArrayInputStream(sequence));
374 ByteArrayOutputStream b = new ByteArrayOutputStream();
375 byte[] buf = new byte[8192];
376 int readBytes = 0;
377 while ((readBytes = decompressor.read(buf)) != -1) {
378 b.write(buf, 0, readBytes);
379 }
380 this.sequence = b.toByteArray();
381 }
382
383 }
384
385 }