View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2009 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // CompactACGT.java
20  // Since: Feb 22, 2010
21  //
22  // $URL$ 
23  // $Author$
24  //--------------------------------------
25  package org.utgenome.format.fasta;
26  
27  import java.io.ByteArrayOutputStream;
28  import java.io.IOException;
29  
30  import org.utgenome.UTGBErrorCode;
31  import org.utgenome.UTGBException;
32  import org.utgenome.gwt.utgb.client.bio.ACGTEncoder;
33  import org.utgenome.util.kmer.KmerIntegerFactory;
34  
35  /**
36   * Compact array for ACGT (and N) sequences
37   * 
38   * @author leo
39   * 
40   */
41  public class CompactACGT implements GenomeSequence {
42  
43  	final byte[] sequence; // 2 bit for each char
44  	final byte[] sequenceMask; // 1 bit for each char: 0 for ACGT, 1 for otherwise including N
45  	private final int length;
46  	private final int offset;
47  
48  	final static int CODE_SIZE = 2;
49  	final static int BYTE = 8;
50  
51  	/**
52  	 * @param sequence
53  	 *            packed sequence
54  	 * @param sequenceMask
55  	 *            packed sequence for N
56  	 * @param bitOffset
57  	 *            offset char length in the packed array
58  	 * @throws UTGBException
59  	 */
60  	CompactACGT(byte[] sequence, byte[] sequenceMask, int length, int offset) throws UTGBException {
61  		this.sequence = sequence;
62  		this.sequenceMask = sequenceMask;
63  
64  		this.length = length;
65  		this.offset = offset;
66  
67  		// assertion
68  		if ((this.sequence.length * (BYTE / CODE_SIZE)) - offset < length) {
69  			throw new UTGBException(UTGBErrorCode.INVALID_INPUT, String.format("packed array is shorter than the specified length: %d (offset=%d) < %d",
70  					sequence.length * (BYTE / CODE_SIZE), offset, length));
71  		}
72  		if ((this.sequenceMask.length * BYTE - offset) < length) {
73  			throw new UTGBException(UTGBErrorCode.INVALID_INPUT, String.format("invalid mask binary: buf size=%d (offset=%d), length=%d", sequenceMask.length,
74  					offset, length));
75  		}
76  	}
77  
78  	class KmerIterator implements OverlappingKmerIterator {
79  		private final int K;
80  		private final KmerIntegerFactory f;
81  		private int start;
82  
83  		public KmerIterator(int K) {
84  			this.K = K;
85  			this.f = new KmerIntegerFactory(K);
86  			int start = offset;
87  		}
88  
89  		public CompactACGT nextKmer() {
90  
91  			if (start + K > length)
92  				return null;
93  
94  			try {
95  				CompactACGT kmer = new CompactACGT(sequence, sequenceMask, K, CompactACGT.this.offset + start++);
96  				return kmer;
97  			}
98  			catch (UTGBException e) {
99  				throw new IllegalStateException(e);
100 			}
101 		}
102 
103 	}
104 
105 	/**
106 	 * return a subsequence of this ACGT string. The returned CompactACGT holds the references to the sequence byte
107 	 * arrays, and no buffer copy will be performed.
108 	 * 
109 	 * @param start
110 	 * @param length
111 	 * @return
112 	 * @throws UTGBException
113 	 */
114 	public CompactACGT subSequence(int start, int length) throws UTGBException {
115 		return new CompactACGT(sequence, sequenceMask, length, start + this.offset);
116 	}
117 
118 	public CompactACGT reverseComplement() throws UTGBException {
119 		ByteArrayOutputStream seqOut = new ByteArrayOutputStream(sequence.length);
120 		ByteArrayOutputStream nSeqOut = new ByteArrayOutputStream(sequenceMask.length);
121 		try {
122 
123 			CompactACGTWriter w = new CompactACGTWriter(seqOut, nSeqOut);
124 			// TODO byte-wise transformation
125 			for (int i = length - 1; i >= 0; i--) {
126 				int c = code2bitAt(i);
127 				c = ~c & 0x03;
128 				w.append2bit((byte) c);
129 			}
130 			w.close();
131 			return new CompactACGT(seqOut.toByteArray(), nSeqOut.toByteArray(), length, 0);
132 		}
133 		catch (IOException e) {
134 			throw UTGBException.convert(e);
135 		}
136 	}
137 
138 	public OverlappingKmerIterator getKmerIterator(int K) {
139 		return new KmerIterator(K);
140 	}
141 
142 	public static CompactACGT createFromString(String seq) throws IOException, UTGBException {
143 
144 		ByteArrayOutputStream seqOut = new ByteArrayOutputStream();
145 		ByteArrayOutputStream nSeqOut = new ByteArrayOutputStream();
146 		try {
147 			CompactACGTWriter w = new CompactACGTWriter(seqOut, nSeqOut);
148 			w.append(seq);
149 			w.close();
150 			return new CompactACGT(seqOut.toByteArray(), nSeqOut.toByteArray(), seq.length(), 0);
151 		}
152 		catch (IOException e) {
153 			throw UTGBException.convert(e);
154 		}
155 
156 	}
157 
158 	public int length() {
159 		return length;
160 	}
161 
162 	public char charAt(int index) {
163 		return ACGTEncoder.toBase(code2bitAt(index));
164 	}
165 
166 	/**
167 	 * Return 2bit code at the index on the genome. If the base character at the position is not an ACGT, returns -1
168 	 * 
169 	 * @param index
170 	 * @return 2bit code or -1 (when 'N')
171 	 */
172 	int code2bitAt(int index) {
173 		int x = index + offset;
174 
175 		int maskPos = x / 8;
176 		int maskOffset = x % 8;
177 		if ((sequenceMask[maskPos] >>> (7 - maskOffset) & 0x01) != 0)
178 			return -1;
179 
180 		int bPos = x / 4;
181 		int bOffset = x % 4;
182 
183 		return (sequence[bPos] >>> (6 - bOffset * 2)) & 0x03;
184 	}
185 
186 	@Override
187 	public String toString() {
188 		StringBuilder buf = new StringBuilder();
189 		for (int i = 0; i < length; i++) {
190 			buf.append(charAt(i));
191 		}
192 		return buf.toString();
193 	}
194 
195 }