View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2009 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // CompactACGTWriter.java
20  // Since: Feb 22, 2010
21  //
22  // $URL$ 
23  // $Author$
24  //--------------------------------------
25  package org.utgenome.format.fasta;
26  
27  import java.io.IOException;
28  import java.io.OutputStream;
29  import java.util.ArrayList;
30  import java.util.Random;
31  
32  import org.utgenome.gwt.utgb.client.bio.ACGTEncoder;
33  import org.xerial.util.StringUtil;
34  
35  /**
36   * ACGT sequence compressor
37   * 
38   * @author leo
39   * 
40   */
41  
42  public class CompactACGTWriter {
43  
44  	private final OutputStream seqOut;
45  	private final OutputStream nSeqOut;
46  	private final int BUFFER_SIZE = 4096;
47  	private final byte[] seqBuffer = new byte[BUFFER_SIZE];
48  	private final byte[] nSeqBuffer = new byte[BUFFER_SIZE / 2];
49  	private int index = 0;
50  	private long length = 0;
51  	private final Random rand = new Random(17); // use fixed seed
52  
53  	public CompactACGTWriter(OutputStream seqOut, OutputStream nSeqOut) {
54  		this.seqOut = seqOut;
55  		this.nSeqOut = nSeqOut;
56  
57  		clearBuffer();
58  	}
59  
60  	private void clearBuffer() {
61  		for (int i = 0; i < seqBuffer.length; ++i)
62  			seqBuffer[i] = 0;
63  
64  		for (int i = 0; i < nSeqBuffer.length; ++i)
65  			nSeqBuffer[i] = 0;
66  	}
67  
68  	public long getSequenceLength() {
69  		return length;
70  	}
71  
72  	public void close() throws IOException {
73  		finish();
74  		seqOut.close();
75  		nSeqOut.close();
76  	}
77  
78  	private void finish() throws IOException {
79  		if (index <= 0)
80  			return;
81  
82  		seqOut.write(seqBuffer, 0, index / 4 + ((index % 4 > 0) ? 1 : 0));
83  		nSeqOut.write(nSeqBuffer, 0, index / 8 + ((index % 8 > 0) ? 1 : 0));
84  		index = 0;
85  
86  		seqOut.flush();
87  		nSeqOut.flush();
88  	}
89  
90  	void append2bit(byte code) throws IOException {
91  
92  		if (index >= BUFFER_SIZE * 4) {
93  			// dump the buffer 
94  			seqOut.write(seqBuffer, 0, BUFFER_SIZE);
95  			nSeqOut.write(nSeqBuffer, 0, BUFFER_SIZE / 2);
96  			clearBuffer();
97  			index = 0;
98  		}
99  
100 		int pos = index / 4;
101 		int offset = index % 4;
102 
103 		if (code >= 4) {
104 			code = (byte) rand.nextInt(4);
105 			nSeqBuffer[index / 8] |= (byte) (0x01 << (7 - (index % 8)));
106 		}
107 
108 		seqBuffer[pos] |= (byte) (code << (6 - offset * 2));
109 		index++;
110 		length++;
111 	}
112 
113 	public void append(String sequence) throws IOException {
114 		String t = sequence.trim();
115 		for (int i = 0; i < t.length(); ++i) {
116 			append2bit(ACGTEncoder.to2bitCode(t.charAt(i)));
117 		}
118 	}
119 
120 	public void append(char ch) throws IOException {
121 		append2bit(ACGTEncoder.to2bitCode(ch));
122 	}
123 
124 	public static byte to2bitCode(char acgt) {
125 		return ACGTEncoder.to2bitCode(acgt);
126 	}
127 
128 	/**
129 	 * This method is used to generate source code for the charToACGTCodeTable
130 	 */
131 	public static void generateCharTo2BitACGTTable() {
132 
133 		ArrayList<Byte> buffer = new ArrayList<Byte>();
134 		for (char c = 0; c < 256; ++c) {
135 			char u = Character.toUpperCase(c);
136 			byte code = 0;
137 			switch (u) {
138 			case 'A':
139 				code = 0;
140 				break;
141 			case 'C':
142 				code = 1;
143 				break;
144 			case 'G':
145 				code = 2;
146 				break;
147 			case 'T':
148 			case 'U':
149 				code = 3;
150 				break;
151 			default:
152 				code = 4;
153 				break;
154 			}
155 			if (buffer.size() >= 16) {
156 				System.out.println(StringUtil.join(buffer, ", ") + ", ");
157 				buffer.clear();
158 			}
159 			buffer.add(code);
160 		}
161 		if (!buffer.isEmpty()) {
162 			System.out.println(StringUtil.join(buffer, ", "));
163 		}
164 
165 	}
166 
167 }