View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2010 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // ACGT.java
20  // Since: 2010/10/07
21  //
22  //--------------------------------------
23  package org.utgenome.gwt.utgb.client.bio;
24  
25  import org.utgenome.format.fasta.CompactACGTWriter;
26  
27  /**
28   * Nucleotide ACGT <-> 2bit code conversion utility
29   * 
30   * @author leo
31   * 
32   */
public class ACGTEncoder {

	/** Code returned for every character that is not A, C, G, T or U (upper or lower case). */
	private static final byte UNKNOWN_BASE_CODE = 4;

	/**
	 * Table for translating a character code in the range 0-255 to a nucleotide 2-bit code (4 is for N and any other
	 * non-ACGT character). It holds the same values as the table that can be generated by
	 * {@link CompactACGTWriter#generateCharTo2BitACGTTable()}.
	 */
	private static final byte[] charToACGTCodeTable = createCharToACGTCodeTable();

	/** Bases indexed by their 2-bit code. */
	private static final char[] BASES = { 'A', 'C', 'G', 'T' };

	/**
	 * Builds the character-to-code lookup table: A=0, C=1, G=2, T=3 and U=3 (both cases), everything else 4.
	 * 
	 * @return a 256-entry lookup table
	 */
	private static byte[] createCharToACGTCodeTable() {
		final byte[] table = new byte[256];
		for (int i = 0; i < table.length; i++) {
			table[i] = UNKNOWN_BASE_CODE;
		}
		table['A'] = table['a'] = 0;
		table['C'] = table['c'] = 1;
		table['G'] = table['g'] = 2;
		table['T'] = table['t'] = 3;
		table['U'] = table['u'] = 3;
		return table;
	}

	/**
	 * Convert a nucleotide character to its 2-bit code.
	 * 
	 * @param base
	 *            nucleotide character
	 * @return 0 for A, 1 for C, 2 for G, 3 for T or U (case-insensitive), and 4 for any other character
	 */
	public static byte to2bitCode(char base) {
		// Characters beyond the table (code 256 and above) cannot be nucleotides; report them as
		// unknown rather than failing with an ArrayIndexOutOfBoundsException.
		if (base >= charToACGTCodeTable.length) {
			return UNKNOWN_BASE_CODE;
		}
		return charToACGTCodeTable[base];
	}

	/**
	 * Convert a 2-bit code back to its nucleotide character.
	 * 
	 * @param code
	 *            2-bit code
	 * @return 'A', 'C', 'G' or 'T' for the codes 0 to 3, and 'N' for any other value
	 */
	public static char toBase(int code) {
		if (code < 0 || code > 3) {
			return 'N';
		}
		return BASES[code];
	}

	/**
	 * Return the k-mer integer of the given ACGT sequence. Only the first K characters are read; when the sequence is
	 * shorter than K, only the available characters are encoded. The first base ends up in the most significant
	 * position of the result.
	 * <p>
	 * Note that an int holds at most 16 bases: with a larger K the leading bases are shifted out, and a 16-mer that
	 * consists only of T is encoded as -1, which cannot be distinguished from the error value.
	 * 
	 * @param K
	 *            number of bases to encode
	 * @param acgt
	 *            nucleotide sequence
	 * @return k-mer integer of the sequence, or -1 when N or the other invalid characters are contained in the sequence
	 */
	public static int toKmerInt(final int K, String acgt) {
		int kmer = 0;

		final int max = Math.min(K, acgt.length());
		for (int i = 0; i < max; i++) {
			final byte b = to2bitCode(acgt.charAt(i));
			if (b >= UNKNOWN_BASE_CODE) {
				return -1;
			}

			kmer <<= 2;
			kmer |= b;
		}

		return kmer;
	}

	/**
	 * Decode a k-mer integer into its nucleotide string of length K.
	 * 
	 * @param kmerInt
	 *            k-mer integer, e.g. a non-error result of {@link #toKmerInt(int, String)}
	 * @param K
	 *            number of bases to decode; positions beyond the 16 bases an int can hold are decoded as 'A'
	 * @return nucleotide sequence of length K (empty when K is not positive)
	 */
	public static String toString(int kmerInt, int K) {
		final StringBuilder seq = new StringBuilder();
		for (int i = 0; i < K; i++) {
			final int position = K - i - 1;
			// Java masks int shift distances to five bits, so shifting by 32 or more would wrap around and
			// re-read the low bits; positions above the 16th base are zero padding instead.
			final int index = position >= 16 ? 0 : (kmerInt >>> (2 * position)) & 0x03;
			seq.append(BASES[index]);
		}
		return seq.toString();
	}

}