View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2009 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // KeywordDBGenerator.java
20  // Since: May 20, 2010
21  //
22  // $URL$ 
23  // $Author$
24  //--------------------------------------
25  package org.utgenome.format.keyword;
26  
27  import java.io.BufferedReader;
28  import java.io.File;
29  import java.io.IOException;
30  import java.io.InputStream;
31  import java.io.Reader;
32  import java.sql.ResultSet;
33  import java.sql.SQLException;
34  import java.util.ArrayList;
35  import java.util.List;
36  
37  import net.sf.samtools.SAMFileReader;
38  import net.sf.samtools.SAMRecord;
39  import net.sf.samtools.SAMFileReader.ValidationStringency;
40  
41  import org.utgenome.UTGBErrorCode;
42  import org.utgenome.UTGBException;
43  import org.utgenome.format.bed.BED2SilkReader;
44  import org.utgenome.format.bed.BEDEntry;
45  import org.utgenome.format.bed.BEDQuery;
46  import org.utgenome.format.bed.BEDTrack;
47  import org.utgenome.format.fasta.CompactFASTAIndex;
48  import org.utgenome.format.keyword.GenomeKeywordEntry.KeywordAlias;
49  import org.utgenome.gwt.utgb.client.bio.KeywordSearchResult;
50  import org.xerial.db.DBException;
51  import org.xerial.db.sql.ResultSetHandler;
52  import org.xerial.db.sql.SQLExpression;
53  import org.xerial.db.sql.SQLUtil;
54  import org.xerial.db.sql.sqlite.SQLiteAccess;
55  import org.xerial.db.sql.sqlite.SQLiteCatalog;
56  import org.xerial.util.StringUtil;
57  import org.xerial.util.log.Logger;
58  
59  /**
60   * Keyword database generator/query interface
61   * 
62   * @author leo
63   * 
64   */
65  public class KeywordDB {
66  
67  	private static Logger _logger = Logger.getLogger(KeywordDB.class);
68  
69  	private SQLiteAccess db;
70  
71  	public KeywordDB(File dbPath) throws DBException {
72  		this(new SQLiteAccess(dbPath.getAbsolutePath()));
73  	}
74  
75  	public KeywordDB(String dbPath) throws DBException, UTGBException {
76  		this(new SQLiteAccess(dbPath));
77  	}
78  
79  	public KeywordDB(SQLiteAccess db) throws DBException {
80  		this.db = db;
81  	}
82  
83  	public void initDB() throws DBException {
84  
85  		SQLiteCatalog catalog = db.getCatalog();
86  		if (!catalog.getTableNameSet().contains("entry")) {
87  			db.update("create table if not exists entry(chr text, start integer, end integer, original_keyword text)");
88  			db.update("create virtual table alias_table using fts3(keyword text, alias text)");
89  			db.update("create virtual table keyword_index using fts3(ref text, keyword text)");
90  		}
91  	}
92  
93  	public KeywordSearchResult query(String ref, String keyword, int page, int pageSize) throws Exception {
94  
95  		if (pageSize > 100)
96  			pageSize = 100;
97  		if (pageSize <= 0)
98  			pageSize = 10;
99  
100 		final KeywordSearchResult r = new KeywordSearchResult();
101 		r.page = page;
102 		String sKeyword = sanitize(keyword);
103 		String keywordSegmentsWithStar = splitAndAddStar(keyword);
104 		if (keywordSegmentsWithStar == null)
105 			return r;
106 
107 		// search alias
108 		String aliasSearchKeyword = null;
109 		String aliasQuery = SQLExpression.fillTemplate("select distinct(keyword), alias from alias_table where alias match \"$1\"", keywordSegmentsWithStar);
110 		List<KeywordAlias> aliases = db.query(aliasQuery, GenomeKeywordEntry.KeywordAlias.class);
111 		ArrayList<String> keywords = new ArrayList<String>();
112 		//keywords.add(sKeyword);
113 		for (KeywordAlias each : aliases) {
114 			keywords.add(sanitize(each.keyword));
115 		}
116 		aliasSearchKeyword = StringUtil.join(keywords, " OR ");
117 
118 		String refCondition = (ref == null) ? "" : SQLExpression.fillTemplate("ref=\"$1\" and ", ref);
119 
120 		String perfectMatchQuery = SQLExpression.fillTemplate(
121 				"select rowid as id, 1 as priority, offsets(keyword_index) as offsets, * from keyword_index where $1 keyword match $2", refCondition, SQLUtil
122 						.doubleQuote(sKeyword));
123 
124 		String aliasPerfectMatchQuery = SQLExpression.fillTemplate(
125 				"select rowid as id, 2 as priority, offsets(keyword_index) as offsets, * from keyword_index where $1 keyword match $2", refCondition, SQLUtil
126 						.doubleQuote(aliasSearchKeyword));
127 
128 		String forwardMatchQuery = SQLExpression.fillTemplate(
129 				"select rowid as id, 3 as priority, offsets(keyword_index) as offsets, * from keyword_index where $1 keyword match $2", refCondition, SQLUtil
130 						.doubleQuote(keywordSegmentsWithStar + " NOT " + sKeyword));
131 
132 		String unionSQL = SQLExpression.fillTemplate("select * from ($1 union all $2 union all $3)", perfectMatchQuery, aliasPerfectMatchQuery,
133 				forwardMatchQuery);
134 
135 		// count the search results
136 		String countSQL = SQLExpression.fillTemplate("select count(*) as count from (select distinct id from ($1))", unionSQL);
137 
138 		db.query(countSQL, new ResultSetHandler<Void>() {
139 			@Override
140 			public Void handle(ResultSet rs) throws SQLException {
141 				r.count = rs.getInt(1);
142 				return null;
143 			}
144 		});
145 
146 		r.maxPage = r.count / pageSize + (r.count % pageSize == 0 ? 0 : 1);
147 
148 		String searchSQLTemplate = "select distinct t.id as id, original_keyword as name, offsets, ref, chr, start, end "
149 				+ "from ($1) t join entry on t.id = entry.rowid order by priority, chr, start limit $2 offset $3";
150 		String keywordSearchSQL = SQLExpression.fillTemplate(searchSQLTemplate, unionSQL, pageSize, pageSize * (page - 1));
151 
152 		r.result = db.query(keywordSearchSQL, KeywordSearchResult.Entry.class);
153 
154 		return r;
155 	}
156 
157 	public static String splitAndAddStar(String keyword) {
158 		if (keyword != null) {
159 			String sunitizedKeyword = sanitize(keyword);
160 			String[] segment = sunitizedKeyword.split("\\s+");
161 			if (segment == null)
162 				return keyword;
163 
164 			ArrayList<String> keywordList = new ArrayList<String>();
165 			for (String s : segment) {
166 				keywordList.add(s + "*");
167 			}
168 			return StringUtil.join(keywordList, " AND ");
169 		}
170 		else
171 			return null;
172 	}
173 
174 	public static String sanitize(String text) {
175 		if (text == null)
176 			return null;
177 
178 		return text.replaceAll("[\\p{Punct}]", "");
179 	}
180 
181 	public void add(GenomeKeywordEntry entry) throws DBException {
182 
183 		String sKeyword = sanitize(entry.text);
184 
185 		String newEntrySQL = SQLExpression.fillTemplate("insert into entry values('$1', $2, $3, '$4')", entry.chr, entry.start, entry.end, entry.text);
186 		String newIndexSQL = SQLExpression.fillTemplate("insert into keyword_index values('$1', '$2')", entry.ref, sKeyword);
187 
188 		db.update(newEntrySQL);
189 		db.update(newIndexSQL);
190 	}
191 
192 	public void add(KeywordAlias alias) throws DBException {
193 		String sql = SQLExpression.fillTemplate("insert into alias_table values('$1', '$2')", alias.keyword, sanitize(alias.alias));
194 		db.update(sql);
195 	}
196 
197 	public void importKeywordAliasFile(Reader keywordAliasFile) throws UTGBException {
198 		try {
199 			initDB();
200 
201 			KeywordAliasReader r = new KeywordAliasReader(keywordAliasFile);
202 			KeywordAlias alias;
203 			db.update("pragma synchronous = off");
204 			db.setAutoCommit(false);
205 			while ((alias = r.next()) != null) {
206 				add(alias);
207 			}
208 			db.update("commit");
209 		}
210 		catch (Exception e) {
211 			throw UTGBException.convert(e);
212 		}
213 	}
214 
215 	public void importFromFASTAIndex(String ref, Reader fastaIndexFile) throws UTGBException {
216 		try {
217 			initDB();
218 
219 			db.update("pragma synchronous = off");
220 			db.setAutoCommit(false);
221 
222 			List<CompactFASTAIndex> index = CompactFASTAIndex.load(fastaIndexFile);
223 			for (CompactFASTAIndex each : index) {
224 				GenomeKeywordEntry e = new GenomeKeywordEntry(ref, each.name, each.name, 1, 1);
225 				add(e);
226 			}
227 			db.update("commit");
228 		}
229 		catch (Exception e) {
230 			throw UTGBException.convert(e);
231 		}
232 	}
233 
234 	public void importFromTAB(String ref, Reader tabFileReader) throws UTGBException {
235 
236 		int entryCount = 0;
237 		int lineCount = 1;
238 		try {
239 			initDB();
240 			db.update("pragma synchronous = off");
241 			db.setAutoCommit(false);
242 
243 			BufferedReader input = new BufferedReader(tabFileReader);
244 			for (String line; (line = input.readLine()) != null; lineCount++) {
245 				// skip the comment line
246 				if (line.startsWith("#"))
247 					continue;
248 
249 				try {
250 					String[] column = line.split("\t");
251 					if (column.length < 4) {
252 						throw new IllegalArgumentException();
253 					}
254 					// chr, start, end, text ....
255 					String chr = column[0];
256 					int start = Integer.parseInt(column[1]);
257 					int end = Integer.parseInt(column[2]);
258 					StringBuilder buf = new StringBuilder();
259 					for (int i = 3; i < column.length; ++i) {
260 						if (i != 3)
261 							buf.append(" ");
262 						buf.append(column[i]);
263 					}
264 					GenomeKeywordEntry entry = new GenomeKeywordEntry(ref, chr, buf.toString(), start, end);
265 
266 					if (entryCount > 0 && (entryCount % 10000 == 0))
267 						_logger.info("num entries: " + entryCount);
268 					entryCount++;
269 
270 					add(entry);
271 				}
272 				catch (IllegalArgumentException e) {
273 					_logger.warn(String.format("line %d has invalid format: %s (%s)", lineCount, line, e.getMessage()));
274 				}
275 				catch (DBException e) {
276 					_logger.error(String.format("line %d: insertion error %s", lineCount, e.getMessage()));
277 					e.printStackTrace(System.err);
278 				}
279 			}
280 
281 			db.update("commit");
282 		}
283 		catch (DBException e) {
284 			_logger.error("error at line " + lineCount);
285 			throw new UTGBException(UTGBErrorCode.DatabaseError, e);
286 		}
287 		catch (IOException e) {
288 			throw new UTGBException(UTGBErrorCode.IO_ERROR, e);
289 		}
290 	}
291 
292 	public void importFromBED(final String ref, Reader bed) throws UTGBException {
293 
294 		try {
295 			initDB();
296 
297 			db.update("pragma synchronous = off");
298 			db.setAutoCommit(false);
299 
300 			BED2SilkReader.scan(bed, new BEDQuery() {
301 
302 				private int entryCount = 0;
303 
304 				public void addGene(BEDEntry gene) {
305 					entryCount++;
306 					GenomeKeywordEntry e = new GenomeKeywordEntry(ref, gene.coordinate, gene.getName(), gene.getStart(), gene.getEnd());
307 					try {
308 						if (entryCount > 0 && (entryCount % 10000 == 0))
309 							_logger.info("num entries: " + entryCount);
310 
311 						KeywordDB.this.add(e);
312 					}
313 					catch (DBException e1) {
314 						reportError(e1);
315 					}
316 				}
317 
318 				public void addTrack(BEDTrack track) {
319 					// simply ignore
320 				}
321 
322 				public void reportError(Exception e) {
323 					_logger.error(e);
324 				}
325 			});
326 
327 			db.update("commit");
328 		}
329 		catch (DBException e) {
330 			throw new UTGBException(UTGBErrorCode.DatabaseError, e);
331 		}
332 
333 	}
334 
335 	public void importFromBAM(final String ref, InputStream samOrBamInput) throws UTGBException {
336 
337 		SAMFileReader samReader = new SAMFileReader(samOrBamInput);
338 		int entryCount = 0;
339 		try {
340 			initDB();
341 			samReader.setValidationStringency(ValidationStringency.SILENT);
342 
343 			db.update("pragma synchronous = off");
344 			db.setAutoCommit(false);
345 
346 			for (SAMRecord each : samReader) {
347 				try {
348 					entryCount++;
349 					GenomeKeywordEntry e = new GenomeKeywordEntry(ref, each.getReferenceName(), each.getReadName(), each.getUnclippedStart(), each
350 							.getUnclippedStart());
351 					if (entryCount > 0 && (entryCount % 10000 == 0))
352 						_logger.info("num entries: " + entryCount);
353 					KeywordDB.this.add(e);
354 				}
355 				catch (DBException e) {
356 					_logger.error(e);
357 				}
358 			}
359 			db.update("commit");
360 		}
361 		catch (DBException e) {
362 			throw new UTGBException(UTGBErrorCode.DatabaseError, e);
363 		}
364 		finally {
365 			samReader.close();
366 		}
367 
368 	}
369 
370 	public void close() throws DBException {
371 		db.dispose();
372 	}
373 
374 }