View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2010 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // SuffixArrayBuilder.java
20  // Since: 2010/10/27
21  //
22  //--------------------------------------
23  package org.utgenome.util.aligner;
24  
25  import java.util.ArrayList;
26  import java.util.HashMap;
27  import java.util.TreeSet;
28  
29  import org.xerial.util.BitVector;
30  import org.xerial.util.StringUtil;
31  
32  /**
33   * SA-IS implementation for building suffix arrays
34   * 
35   * @author leo
36   * 
37   */
38  public class SuffixArrayBuilder {
39  	private RandomAccess input;
40  	private final int N;
41  	private final int K;
42  	private int[] bucket;
43  	private BitVector typeLS;
44  
45  	public SuffixArrayBuilder(RandomAccess input, final int N, final int K) {
46  		this.input = input;
47  		this.N = N;
48  		this.K = K;
49  		this.bucket = new int[K + 1];
50  		typeLS = new BitVector(N);
51  	}
52  
53  	public SuffixArrayBuilder(String input) {
54  		this.N = input.length() + 1;
55  		StringWrapper w = new StringWrapper(input);
56  		this.input = new SubArray(w.array, 0);
57  		this.K = w.K;
58  		this.bucket = new int[K + 1];
59  		typeLS = new BitVector(N);
60  	}
61  
62  	public static class StringWrapper implements RandomAccess {
63  		public final int[] array;
64  		public final int K;
65  
66  		public StringWrapper(String s) {
67  			array = new int[s.length() + 1];
68  
69  			TreeSet<Character> inputDomain = new TreeSet<Character>();
70  			for (int i = 0; i < s.length(); ++i) {
71  				char ch = s.charAt(i);
72  				inputDomain.add(ch);
73  			}
74  			// assign code ID
75  			HashMap<Character, Integer> codeTable = new HashMap<Character, Integer>();
76  			int codeCount = 1;
77  			for (char ch : inputDomain) {
78  				if (!codeTable.containsKey(ch)) {
79  					codeTable.put(ch, codeCount++);
80  				}
81  			}
82  
83  			// translate to int array
84  			for (int i = 0; i < s.length(); ++i) {
85  				array[i] = codeTable.get(s.charAt(i));
86  			}
87  			array[s.length()] = 0;
88  			this.K = codeTable.size() + 1;
89  		}
90  
91  		public int get(long index) {
92  			return array[(int) index];
93  		}
94  
95  		public void set(long index, int value) {
96  			array[(int) index] = value;
97  		}
98  	}
99  
100 	public static class SubArray implements RandomAccess {
101 
102 		private final int[] orig;
103 		private final int offset;
104 
105 		public SubArray(final int[] orig, int offset) {
106 			this.orig = orig;
107 			this.offset = offset;
108 		}
109 
110 		public int get(long index) {
111 			return orig[(int) index + offset];
112 		}
113 
114 		public void set(long index, int value) {
115 			orig[(int) index + offset] = value;
116 		}
117 
118 		@Override
119 		public String toString() {
120 			ArrayList<Integer> v = new ArrayList<Integer>();
121 			for (int i = offset; i < orig.length; ++i)
122 				v.add(orig[i]);
123 			return StringUtil.join(v, ", ");
124 
125 		}
126 	}
127 
128 	public int[] SAIS() {
129 		int[] result = new int[N];
130 		SAIS(result);
131 		return result;
132 	}
133 
134 	public void SAIS(int[] SA) {
135 		typeLS.set(N - 2, false);
136 		typeLS.set(N - 1, true); // the sentinel 
137 
138 		// set the type of each character
139 		for (int i = N - 3; i >= 0; --i) {
140 			typeLS.set(i, input.get(i) < input.get(i + 1) || (input.get(i) == input.get(i + 1) && typeLS.get(i + 1)));
141 		}
142 
143 		// Step 1: reduce the problem by at least 1/2 
144 		// sort all the S-substrings
145 
146 		// create a bucket array
147 		findEndOfBuckets();
148 
149 		// initialize the suffix array
150 		for (int i = 0; i < N; ++i)
151 			SA[i] = -1;
152 
153 		for (int i = 1; i < N; ++i) {
154 			if (isLMS(i))
155 				SA[--bucket[input.get(i)]] = i;
156 		}
157 
158 		induceSA_left(SA);
159 		induceSA_right(SA);
160 
161 		// Compact all the sorted subtrings into the first N1 items of SA
162 		// 2*n1 must be not larger than N 
163 		int N1 = 0;
164 		for (int i = 0; i < N; ++i) {
165 			if (isLMS(SA[i]))
166 				SA[N1++] = SA[i];
167 		}
168 
169 		// init the name array buffer
170 		for (int i = N1; i < N; ++i)
171 			SA[i] = -1;
172 		// find the lexicographic names of substrings
173 		int name = 0;
174 		int prev = -1;
175 		for (int i = 0; i < N1; i++) {
176 			final int pos = SA[i];
177 			final int plen = SA[N1 + (pos >> 1)];
178 			boolean diff = false;
179 
180 			for (int d = 0; d < N; ++d) {
181 				if (prev == -1 || input.get(pos + d) != input.get(prev + d) || typeLS.get(pos + d) != typeLS.get(prev + d)) {
182 					diff = true;
183 					break;
184 				}
185 				else if (d > 0 && (isLMS(pos + d) || isLMS(prev + d)))
186 					break;
187 			}
188 
189 			if (diff) {
190 				name++;
191 				prev = pos;
192 			}
193 
194 			SA[N1 + (pos >> 1)] = name - 1;
195 		}
196 
197 		for (int i = N - 1, j = N - 1; i >= N1; --i) {
198 			if (SA[i] >= 0)
199 				SA[j--] = SA[i];
200 		}
201 
202 		// Step 2: solve the reduced problem
203 		int SA1[] = SA;
204 		RandomAccess inputS1 = new SubArray(SA, N - N1);
205 		if (name < N1) {
206 			new SuffixArrayBuilder(inputS1, N1, name - 1).SAIS(SA1);
207 		}
208 		else {
209 			// Generate the suffix array of inputS1 directory.
210 			for (int i = 0; i < N1; i++)
211 				SA1[inputS1.get(i)] = i;
212 		}
213 
214 		// Step 3: Induce the result for the original problem
215 		findEndOfBuckets();
216 		for (int i = 1, j = 0; i < N; ++i) {
217 			if (isLMS(i))
218 				inputS1.set(j++, i); // get p1
219 		}
220 		for (int i = 0; i < N1; ++i) {
221 			SA1[i] = inputS1.get(SA1[i]);
222 		}
223 		// init SA[N1 .. N-1]
224 		for (int i = N1; i < N; ++i) {
225 			SA[i] = -1;
226 		}
227 		for (int i = N1 - 1; i >= 0; --i) {
228 			int j = SA[i];
229 			SA[i] = -1;
230 			SA[--bucket[input.get(j)]] = j;
231 		}
232 		induceSA_left(SA);
233 		induceSA_right(SA);
234 
235 	}
236 
237 	private void findStartOfBuckets() {
238 		initBuckets();
239 		// compute the start of the buckets
240 		int sum = 0;
241 		for (int i = 0; i <= K; ++i) {
242 			sum += bucket[i];
243 			bucket[i] = sum - bucket[i];
244 		}
245 	}
246 
247 	private void findEndOfBuckets() {
248 		initBuckets();
249 		// compute the end of the buckets
250 		int sum = 0;
251 		for (int i = 0; i <= K; ++i) {
252 			sum += bucket[i];
253 			bucket[i] = sum;
254 		}
255 	}
256 
257 	private void initBuckets() {
258 		// initialize buckets
259 		for (int i = 0; i <= K; ++i) {
260 			bucket[i] = 0;
261 		}
262 		// compute the size of each bucket
263 		for (int i = 0; i < N; ++i) {
264 			bucket[input.get(i)]++;
265 		}
266 	}
267 
268 	boolean isLMS(int pos) {
269 		return pos > 0 && typeLS.get(pos) && !typeLS.get(pos - 1);
270 	}
271 
272 	private void induceSA_left(int[] SA) {
273 		findStartOfBuckets();
274 		int j;
275 		for (int i = 0; i < N; ++i) {
276 			j = SA[i] - 1;
277 			if (j >= 0 && !typeLS.get(j))
278 				SA[bucket[input.get(j)]++] = j;
279 		}
280 	}
281 
282 	private void induceSA_right(int[] SA) {
283 		findEndOfBuckets();
284 		int j;
285 		for (int i = N - 1; i >= 0; --i) {
286 			j = SA[i] - 1;
287 			if (j >= 0 && typeLS.get(j))
288 				SA[--bucket[input.get(j)]] = j;
289 		}
290 	}
291 
292 }