1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.utgenome.format.keyword;
26
27 import java.io.BufferedReader;
28 import java.io.File;
29 import java.io.IOException;
30 import java.io.InputStream;
31 import java.io.Reader;
32 import java.sql.ResultSet;
33 import java.sql.SQLException;
34 import java.util.ArrayList;
35 import java.util.List;
36
37 import net.sf.samtools.SAMFileReader;
38 import net.sf.samtools.SAMRecord;
39 import net.sf.samtools.SAMFileReader.ValidationStringency;
40
41 import org.utgenome.UTGBErrorCode;
42 import org.utgenome.UTGBException;
43 import org.utgenome.format.bed.BED2SilkReader;
44 import org.utgenome.format.bed.BEDEntry;
45 import org.utgenome.format.bed.BEDQuery;
46 import org.utgenome.format.bed.BEDTrack;
47 import org.utgenome.format.fasta.CompactFASTAIndex;
48 import org.utgenome.format.keyword.GenomeKeywordEntry.KeywordAlias;
49 import org.utgenome.gwt.utgb.client.bio.KeywordSearchResult;
50 import org.xerial.db.DBException;
51 import org.xerial.db.sql.ResultSetHandler;
52 import org.xerial.db.sql.SQLExpression;
53 import org.xerial.db.sql.SQLUtil;
54 import org.xerial.db.sql.sqlite.SQLiteAccess;
55 import org.xerial.db.sql.sqlite.SQLiteCatalog;
56 import org.xerial.util.StringUtil;
57 import org.xerial.util.log.Logger;
58
59
60
61
62
63
64
65 public class KeywordDB {
66
67 private static Logger _logger = Logger.getLogger(KeywordDB.class);
68
69 private SQLiteAccess db;
70
71 public KeywordDB(File dbPath) throws DBException {
72 this(new SQLiteAccess(dbPath.getAbsolutePath()));
73 }
74
75 public KeywordDB(String dbPath) throws DBException, UTGBException {
76 this(new SQLiteAccess(dbPath));
77 }
78
79 public KeywordDB(SQLiteAccess db) throws DBException {
80 this.db = db;
81 }
82
83 public void initDB() throws DBException {
84
85 SQLiteCatalog catalog = db.getCatalog();
86 if (!catalog.getTableNameSet().contains("entry")) {
87 db.update("create table if not exists entry(chr text, start integer, end integer, original_keyword text)");
88 db.update("create virtual table alias_table using fts3(keyword text, alias text)");
89 db.update("create virtual table keyword_index using fts3(ref text, keyword text)");
90 }
91 }
92
93 public KeywordSearchResult query(String ref, String keyword, int page, int pageSize) throws Exception {
94
95 if (pageSize > 100)
96 pageSize = 100;
97 if (pageSize <= 0)
98 pageSize = 10;
99
100 final KeywordSearchResult r = new KeywordSearchResult();
101 r.page = page;
102 String sKeyword = sanitize(keyword);
103 String keywordSegmentsWithStar = splitAndAddStar(keyword);
104 if (keywordSegmentsWithStar == null)
105 return r;
106
107
108 String aliasSearchKeyword = null;
109 String aliasQuery = SQLExpression.fillTemplate("select distinct(keyword), alias from alias_table where alias match \"$1\"", keywordSegmentsWithStar);
110 List<KeywordAlias> aliases = db.query(aliasQuery, GenomeKeywordEntry.KeywordAlias.class);
111 ArrayList<String> keywords = new ArrayList<String>();
112
113 for (KeywordAlias each : aliases) {
114 keywords.add(sanitize(each.keyword));
115 }
116 aliasSearchKeyword = StringUtil.join(keywords, " OR ");
117
118 String refCondition = (ref == null) ? "" : SQLExpression.fillTemplate("ref=\"$1\" and ", ref);
119
120 String perfectMatchQuery = SQLExpression.fillTemplate(
121 "select rowid as id, 1 as priority, offsets(keyword_index) as offsets, * from keyword_index where $1 keyword match $2", refCondition, SQLUtil
122 .doubleQuote(sKeyword));
123
124 String aliasPerfectMatchQuery = SQLExpression.fillTemplate(
125 "select rowid as id, 2 as priority, offsets(keyword_index) as offsets, * from keyword_index where $1 keyword match $2", refCondition, SQLUtil
126 .doubleQuote(aliasSearchKeyword));
127
128 String forwardMatchQuery = SQLExpression.fillTemplate(
129 "select rowid as id, 3 as priority, offsets(keyword_index) as offsets, * from keyword_index where $1 keyword match $2", refCondition, SQLUtil
130 .doubleQuote(keywordSegmentsWithStar + " NOT " + sKeyword));
131
132 String unionSQL = SQLExpression.fillTemplate("select * from ($1 union all $2 union all $3)", perfectMatchQuery, aliasPerfectMatchQuery,
133 forwardMatchQuery);
134
135
136 String countSQL = SQLExpression.fillTemplate("select count(*) as count from (select distinct id from ($1))", unionSQL);
137
138 db.query(countSQL, new ResultSetHandler<Void>() {
139 @Override
140 public Void handle(ResultSet rs) throws SQLException {
141 r.count = rs.getInt(1);
142 return null;
143 }
144 });
145
146 r.maxPage = r.count / pageSize + (r.count % pageSize == 0 ? 0 : 1);
147
148 String searchSQLTemplate = "select distinct t.id as id, original_keyword as name, offsets, ref, chr, start, end "
149 + "from ($1) t join entry on t.id = entry.rowid order by priority, chr, start limit $2 offset $3";
150 String keywordSearchSQL = SQLExpression.fillTemplate(searchSQLTemplate, unionSQL, pageSize, pageSize * (page - 1));
151
152 r.result = db.query(keywordSearchSQL, KeywordSearchResult.Entry.class);
153
154 return r;
155 }
156
157 public static String splitAndAddStar(String keyword) {
158 if (keyword != null) {
159 String sunitizedKeyword = sanitize(keyword);
160 String[] segment = sunitizedKeyword.split("\\s+");
161 if (segment == null)
162 return keyword;
163
164 ArrayList<String> keywordList = new ArrayList<String>();
165 for (String s : segment) {
166 keywordList.add(s + "*");
167 }
168 return StringUtil.join(keywordList, " AND ");
169 }
170 else
171 return null;
172 }
173
174 public static String sanitize(String text) {
175 if (text == null)
176 return null;
177
178 return text.replaceAll("[\\p{Punct}]", "");
179 }
180
181 public void add(GenomeKeywordEntry entry) throws DBException {
182
183 String sKeyword = sanitize(entry.text);
184
185 String newEntrySQL = SQLExpression.fillTemplate("insert into entry values('$1', $2, $3, '$4')", entry.chr, entry.start, entry.end, entry.text);
186 String newIndexSQL = SQLExpression.fillTemplate("insert into keyword_index values('$1', '$2')", entry.ref, sKeyword);
187
188 db.update(newEntrySQL);
189 db.update(newIndexSQL);
190 }
191
192 public void add(KeywordAlias alias) throws DBException {
193 String sql = SQLExpression.fillTemplate("insert into alias_table values('$1', '$2')", alias.keyword, sanitize(alias.alias));
194 db.update(sql);
195 }
196
197 public void importKeywordAliasFile(Reader keywordAliasFile) throws UTGBException {
198 try {
199 initDB();
200
201 KeywordAliasReader r = new KeywordAliasReader(keywordAliasFile);
202 KeywordAlias alias;
203 db.update("pragma synchronous = off");
204 db.setAutoCommit(false);
205 while ((alias = r.next()) != null) {
206 add(alias);
207 }
208 db.update("commit");
209 }
210 catch (Exception e) {
211 throw UTGBException.convert(e);
212 }
213 }
214
215 public void importFromFASTAIndex(String ref, Reader fastaIndexFile) throws UTGBException {
216 try {
217 initDB();
218
219 db.update("pragma synchronous = off");
220 db.setAutoCommit(false);
221
222 List<CompactFASTAIndex> index = CompactFASTAIndex.load(fastaIndexFile);
223 for (CompactFASTAIndex each : index) {
224 GenomeKeywordEntry e = new GenomeKeywordEntry(ref, each.name, each.name, 1, 1);
225 add(e);
226 }
227 db.update("commit");
228 }
229 catch (Exception e) {
230 throw UTGBException.convert(e);
231 }
232 }
233
234 public void importFromTAB(String ref, Reader tabFileReader) throws UTGBException {
235
236 int entryCount = 0;
237 int lineCount = 1;
238 try {
239 initDB();
240 db.update("pragma synchronous = off");
241 db.setAutoCommit(false);
242
243 BufferedReader input = new BufferedReader(tabFileReader);
244 for (String line; (line = input.readLine()) != null; lineCount++) {
245
246 if (line.startsWith("#"))
247 continue;
248
249 try {
250 String[] column = line.split("\t");
251 if (column.length < 4) {
252 throw new IllegalArgumentException();
253 }
254
255 String chr = column[0];
256 int start = Integer.parseInt(column[1]);
257 int end = Integer.parseInt(column[2]);
258 StringBuilder buf = new StringBuilder();
259 for (int i = 3; i < column.length; ++i) {
260 if (i != 3)
261 buf.append(" ");
262 buf.append(column[i]);
263 }
264 GenomeKeywordEntry entry = new GenomeKeywordEntry(ref, chr, buf.toString(), start, end);
265
266 if (entryCount > 0 && (entryCount % 10000 == 0))
267 _logger.info("num entries: " + entryCount);
268 entryCount++;
269
270 add(entry);
271 }
272 catch (IllegalArgumentException e) {
273 _logger.warn(String.format("line %d has invalid format: %s (%s)", lineCount, line, e.getMessage()));
274 }
275 catch (DBException e) {
276 _logger.error(String.format("line %d: insertion error %s", lineCount, e.getMessage()));
277 e.printStackTrace(System.err);
278 }
279 }
280
281 db.update("commit");
282 }
283 catch (DBException e) {
284 _logger.error("error at line " + lineCount);
285 throw new UTGBException(UTGBErrorCode.DatabaseError, e);
286 }
287 catch (IOException e) {
288 throw new UTGBException(UTGBErrorCode.IO_ERROR, e);
289 }
290 }
291
292 public void importFromBED(final String ref, Reader bed) throws UTGBException {
293
294 try {
295 initDB();
296
297 db.update("pragma synchronous = off");
298 db.setAutoCommit(false);
299
300 BED2SilkReader.scan(bed, new BEDQuery() {
301
302 private int entryCount = 0;
303
304 public void addGene(BEDEntry gene) {
305 entryCount++;
306 GenomeKeywordEntry e = new GenomeKeywordEntry(ref, gene.coordinate, gene.getName(), gene.getStart(), gene.getEnd());
307 try {
308 if (entryCount > 0 && (entryCount % 10000 == 0))
309 _logger.info("num entries: " + entryCount);
310
311 KeywordDB.this.add(e);
312 }
313 catch (DBException e1) {
314 reportError(e1);
315 }
316 }
317
318 public void addTrack(BEDTrack track) {
319
320 }
321
322 public void reportError(Exception e) {
323 _logger.error(e);
324 }
325 });
326
327 db.update("commit");
328 }
329 catch (DBException e) {
330 throw new UTGBException(UTGBErrorCode.DatabaseError, e);
331 }
332
333 }
334
335 public void importFromBAM(final String ref, InputStream samOrBamInput) throws UTGBException {
336
337 SAMFileReader samReader = new SAMFileReader(samOrBamInput);
338 int entryCount = 0;
339 try {
340 initDB();
341 samReader.setValidationStringency(ValidationStringency.SILENT);
342
343 db.update("pragma synchronous = off");
344 db.setAutoCommit(false);
345
346 for (SAMRecord each : samReader) {
347 try {
348 entryCount++;
349 GenomeKeywordEntry e = new GenomeKeywordEntry(ref, each.getReferenceName(), each.getReadName(), each.getUnclippedStart(), each
350 .getUnclippedStart());
351 if (entryCount > 0 && (entryCount % 10000 == 0))
352 _logger.info("num entries: " + entryCount);
353 KeywordDB.this.add(e);
354 }
355 catch (DBException e) {
356 _logger.error(e);
357 }
358 }
359 db.update("commit");
360 }
361 catch (DBException e) {
362 throw new UTGBException(UTGBErrorCode.DatabaseError, e);
363 }
364 finally {
365 samReader.close();
366 }
367
368 }
369
370 public void close() throws DBException {
371 db.dispose();
372 }
373
374 }