View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2009 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // CompactFASTA.java
20  // Since: 2010/03/12
21  //
22  // $URL$ 
23  // $Author$
24  //--------------------------------------
25  package org.utgenome.format.fasta;
26  
27  import java.io.BufferedReader;
28  import java.io.File;
29  import java.io.FileInputStream;
30  import java.io.FileNotFoundException;
31  import java.io.FileReader;
32  import java.io.IOException;
33  import java.io.RandomAccessFile;
34  import java.util.Collections;
35  import java.util.LinkedHashMap;
36  import java.util.Set;
37  
38  import org.utgenome.UTGBException;
39  import org.xerial.core.XerialException;
40  
41  /**
42   * CompactFASTA is a packed FASTA file, supporting random accesses to sub sequences of a specified chromosome.
43   * 
44   * @author leo
45   * 
46   */
47  public class CompactFASTA {
48  
49  	public final static String PAC_FILE_SUFFIX = ".pac";
50  	public final static String PAC_N_FILE_SUFFIX = ".pacn";
51  	public final static String PAC_INDEX_FILE_SUFFIX = ".i.silk";
52  
53  	private final LinkedHashMap<String, CompactFASTAIndex> indexTable = new LinkedHashMap<String, CompactFASTAIndex>();
54  
55  	private final String fastaFilePrefix;
56  	private final PacFileAccess access;
57  
58  	/**
59  	 * @param fastaFilePrefix
60  	 * @throws XerialException
61  	 * @throws IOException
62  	 */
63  	public CompactFASTA(String fastaFile) throws XerialException, IOException {
64  		this(fastaFile, false);
65  	}
66  
67  	private CompactFASTA(String fastaFile, boolean loadIntoMemory) throws XerialException, IOException {
68  		this.fastaFilePrefix = fastaFile;
69  
70  		File indexFile = new File(fastaFilePrefix + PAC_INDEX_FILE_SUFFIX);
71  		for (CompactFASTAIndex each : CompactFASTAIndex.load(new BufferedReader(new FileReader(indexFile)))) {
72  			indexTable.put(each.name, each);
73  		}
74  
75  		File pacFile = new File(fastaFilePrefix + PAC_FILE_SUFFIX);
76  		File pacNFile = new File(fastaFilePrefix + PAC_N_FILE_SUFFIX);
77  		if (!loadIntoMemory)
78  			access = new OnDiskAccess(pacFile, pacNFile);
79  		else
80  			access = new OnMemoryBuffer(pacFile, pacNFile);
81  	}
82  
83  	/**
84  	 * Get the set of chromosome names
85  	 * 
86  	 * @return
87  	 */
88  	public Set<String> getChrSet() {
89  		return Collections.unmodifiableSet(indexTable.keySet());
90  	}
91  
92  	/**
93  	 * Test the specified chromosome name is in this FASTA
94  	 * 
95  	 * @param chr
96  	 * @return
97  	 */
98  	public boolean containsChr(String chr) {
99  		return indexTable.containsKey(chr);
100 	}
101 
102 	public static CompactFASTA loadIntoMemory(String fastaFilePrefix) throws XerialException, IOException {
103 		return new CompactFASTA(fastaFilePrefix, true);
104 	}
105 
106 	public void close() throws IOException {
107 		access.close();
108 	}
109 
110 	/**
111 	 * Retrieves a genome sequence of the specified range [start, end)
112 	 * 
113 	 * @param chr
114 	 *            sequence name
115 	 * @param start
116 	 *            start position on the genome (0-origin)
117 	 * @param end
118 	 *            end position on genome (0-origin, exclusive)
119 	 * @return genome sequence of the specified range, or null if no entry found for the given sequence name
120 	 * @throws IOException
121 	 * @throws UTGBException
122 	 */
123 	public CompactACGT getSequence(String chr, int start, int end) throws IOException, UTGBException {
124 		if (!indexTable.containsKey(chr))
125 			return null;
126 		CompactFASTAIndex index = indexTable.get(chr);
127 		return getSequence(index, start, end);
128 	}
129 
130 	CompactACGT getSequence(CompactFASTAIndex index, int start, int end) throws IOException, UTGBException {
131 		if (index == null)
132 			throw new IllegalArgumentException("index must not be null");
133 
134 		if (start > end) {
135 			int tmp = end;
136 			end = start;
137 			start = tmp;
138 		}
139 
140 		int length = end - start;
141 		if (length > index.length)
142 			length = (index.length - start);
143 
144 		long bStart = start + index.offset;
145 		long bEnd = bStart + length;
146 		long pac_lowerBound = bStart / 4;
147 		long pac_upperBound = bEnd / 4 + (bEnd % 4 != 0 ? 1 : 0);
148 		long pacN_lowerBound = bStart / 8;
149 		long pacN_upperBound = bEnd / 8 + (bEnd % 8 != 0 ? 1 : 0);
150 
151 		//     s-------e 
152 		// |--------|------]
153 		byte[] seqBuf = new byte[(int) (pac_upperBound - pac_lowerBound)];
154 		byte[] seqNBuf = new byte[(int) (pacN_upperBound - pacN_lowerBound)];
155 
156 		access.readSeq(pac_lowerBound, seqBuf);
157 		access.readNSeq(pacN_lowerBound, seqNBuf);
158 		return new CompactACGT(seqBuf, seqNBuf, length, (int) bStart % 4);
159 
160 	}
161 
162 	public GenomeSequence getSequence(String chr, int start) throws IOException, UTGBException {
163 		if (!indexTable.containsKey(chr))
164 			return null;
165 		CompactFASTAIndex index = indexTable.get(chr);
166 		return getSequence(index, start, index.length);
167 	}
168 
169 	public GenomeSequence getSequence(String chr) throws IOException, UTGBException {
170 		return getSequence(chr, 0);
171 	}
172 
173 	public static String pickSequenceName(String descriptionLine) {
174 		int begin = 0;
175 		if (descriptionLine.length() > 0 && descriptionLine.charAt(0) == '>')
176 			begin++;
177 
178 		// skip leading white spaces
179 		for (; begin < descriptionLine.length(); ++begin) {
180 			char c = descriptionLine.charAt(begin);
181 			if (!(c == ' ' | c == '\t'))
182 				break;
183 		}
184 		int end = begin + 1;
185 		for (; end < descriptionLine.length(); ++end) {
186 			char c = descriptionLine.charAt(end);
187 			if (c == ' ' | c == '\t') {
188 				break;
189 			}
190 		}
191 		return descriptionLine.substring(begin, end);
192 	}
193 
194 	public interface PacFileAccess {
195 		public void readSeq(long cursor, byte[] buf) throws IOException;
196 
197 		public void readNSeq(long cursor, byte[] buf) throws IOException;
198 
199 		public void close() throws IOException;
200 	}
201 
202 	public class OnDiskAccess implements PacFileAccess {
203 		private final RandomAccessFile packedFASTA;
204 		private final RandomAccessFile packedFASTA_N;
205 
206 		public OnDiskAccess(File pacFile, File pacNFile) throws FileNotFoundException {
207 			packedFASTA = new RandomAccessFile(pacFile, "r");
208 			packedFASTA_N = new RandomAccessFile(pacNFile, "r");
209 		}
210 
211 		public void close() throws IOException {
212 			if (packedFASTA != null)
213 				packedFASTA.close();
214 			if (packedFASTA_N != null)
215 				packedFASTA_N.close();
216 		}
217 
218 		public void readNSeq(long cursor, byte[] buf) throws IOException {
219 			packedFASTA_N.seek(cursor);
220 			packedFASTA_N.read(buf);
221 		}
222 
223 		public void readSeq(long cursor, byte[] buf) throws IOException {
224 			packedFASTA.seek(cursor);
225 			packedFASTA.read(buf);
226 		}
227 
228 	}
229 
230 	public class OnMemoryBuffer implements PacFileAccess {
231 		byte[] pac;
232 		byte[] nPac;
233 
234 		public OnMemoryBuffer(File pacFile, File pacNFile) throws IOException {
235 			long pacSize = pacFile.length();
236 			long nPacSize = pacNFile.length();
237 
238 			// maximum: 4 * 2GB = 8G bases  
239 			pac = new byte[(int) pacSize];
240 			nPac = new byte[(int) nPacSize];
241 
242 			// read sequences
243 			FileInputStream f = new FileInputStream(pacFile);
244 			f.read(pac);
245 			f.close();
246 
247 			FileInputStream fn = new FileInputStream(pacNFile);
248 			fn.read(nPac);
249 			fn.close();
250 		}
251 
252 		public void readSeq(long cursor, byte[] buf) throws IOException {
253 			System.arraycopy(pac, (int) cursor, buf, 0, buf.length);
254 		}
255 
256 		public void readNSeq(long cursor, byte[] buf) throws IOException {
257 			System.arraycopy(nPac, (int) cursor, buf, 0, buf.length);
258 		}
259 
260 		public void close() throws IOException {
261 			// nothing to do
262 		}
263 
264 	}
265 
266 }