1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.utgenome.util.aligner;
24
25 import java.util.ArrayList;
26 import java.util.HashMap;
27 import java.util.TreeSet;
28
29 import org.xerial.util.BitVector;
30 import org.xerial.util.StringUtil;
31
32
33
34
35
36
37
38 public class SuffixArrayBuilder {
39 private RandomAccess input;
40 private final int N;
41 private final int K;
42 private int[] bucket;
43 private BitVector typeLS;
44
45 public SuffixArrayBuilder(RandomAccess input, final int N, final int K) {
46 this.input = input;
47 this.N = N;
48 this.K = K;
49 this.bucket = new int[K + 1];
50 typeLS = new BitVector(N);
51 }
52
53 public SuffixArrayBuilder(String input) {
54 this.N = input.length() + 1;
55 StringWrapper w = new StringWrapper(input);
56 this.input = new SubArray(w.array, 0);
57 this.K = w.K;
58 this.bucket = new int[K + 1];
59 typeLS = new BitVector(N);
60 }
61
62 public static class StringWrapper implements RandomAccess {
63 public final int[] array;
64 public final int K;
65
66 public StringWrapper(String s) {
67 array = new int[s.length() + 1];
68
69 TreeSet<Character> inputDomain = new TreeSet<Character>();
70 for (int i = 0; i < s.length(); ++i) {
71 char ch = s.charAt(i);
72 inputDomain.add(ch);
73 }
74
75 HashMap<Character, Integer> codeTable = new HashMap<Character, Integer>();
76 int codeCount = 1;
77 for (char ch : inputDomain) {
78 if (!codeTable.containsKey(ch)) {
79 codeTable.put(ch, codeCount++);
80 }
81 }
82
83
84 for (int i = 0; i < s.length(); ++i) {
85 array[i] = codeTable.get(s.charAt(i));
86 }
87 array[s.length()] = 0;
88 this.K = codeTable.size() + 1;
89 }
90
91 public int get(long index) {
92 return array[(int) index];
93 }
94
95 public void set(long index, int value) {
96 array[(int) index] = value;
97 }
98 }
99
100 public static class SubArray implements RandomAccess {
101
102 private final int[] orig;
103 private final int offset;
104
105 public SubArray(final int[] orig, int offset) {
106 this.orig = orig;
107 this.offset = offset;
108 }
109
110 public int get(long index) {
111 return orig[(int) index + offset];
112 }
113
114 public void set(long index, int value) {
115 orig[(int) index + offset] = value;
116 }
117
118 @Override
119 public String toString() {
120 ArrayList<Integer> v = new ArrayList<Integer>();
121 for (int i = offset; i < orig.length; ++i)
122 v.add(orig[i]);
123 return StringUtil.join(v, ", ");
124
125 }
126 }
127
128 public int[] SAIS() {
129 int[] result = new int[N];
130 SAIS(result);
131 return result;
132 }
133
134 public void SAIS(int[] SA) {
135 typeLS.set(N - 2, false);
136 typeLS.set(N - 1, true);
137
138
139 for (int i = N - 3; i >= 0; --i) {
140 typeLS.set(i, input.get(i) < input.get(i + 1) || (input.get(i) == input.get(i + 1) && typeLS.get(i + 1)));
141 }
142
143
144
145
146
147 findEndOfBuckets();
148
149
150 for (int i = 0; i < N; ++i)
151 SA[i] = -1;
152
153 for (int i = 1; i < N; ++i) {
154 if (isLMS(i))
155 SA[--bucket[input.get(i)]] = i;
156 }
157
158 induceSA_left(SA);
159 induceSA_right(SA);
160
161
162
163 int N1 = 0;
164 for (int i = 0; i < N; ++i) {
165 if (isLMS(SA[i]))
166 SA[N1++] = SA[i];
167 }
168
169
170 for (int i = N1; i < N; ++i)
171 SA[i] = -1;
172
173 int name = 0;
174 int prev = -1;
175 for (int i = 0; i < N1; i++) {
176 final int pos = SA[i];
177 final int plen = SA[N1 + (pos >> 1)];
178 boolean diff = false;
179
180 for (int d = 0; d < N; ++d) {
181 if (prev == -1 || input.get(pos + d) != input.get(prev + d) || typeLS.get(pos + d) != typeLS.get(prev + d)) {
182 diff = true;
183 break;
184 }
185 else if (d > 0 && (isLMS(pos + d) || isLMS(prev + d)))
186 break;
187 }
188
189 if (diff) {
190 name++;
191 prev = pos;
192 }
193
194 SA[N1 + (pos >> 1)] = name - 1;
195 }
196
197 for (int i = N - 1, j = N - 1; i >= N1; --i) {
198 if (SA[i] >= 0)
199 SA[j--] = SA[i];
200 }
201
202
203 int SA1[] = SA;
204 RandomAccess inputS1 = new SubArray(SA, N - N1);
205 if (name < N1) {
206 new SuffixArrayBuilder(inputS1, N1, name - 1).SAIS(SA1);
207 }
208 else {
209
210 for (int i = 0; i < N1; i++)
211 SA1[inputS1.get(i)] = i;
212 }
213
214
215 findEndOfBuckets();
216 for (int i = 1, j = 0; i < N; ++i) {
217 if (isLMS(i))
218 inputS1.set(j++, i);
219 }
220 for (int i = 0; i < N1; ++i) {
221 SA1[i] = inputS1.get(SA1[i]);
222 }
223
224 for (int i = N1; i < N; ++i) {
225 SA[i] = -1;
226 }
227 for (int i = N1 - 1; i >= 0; --i) {
228 int j = SA[i];
229 SA[i] = -1;
230 SA[--bucket[input.get(j)]] = j;
231 }
232 induceSA_left(SA);
233 induceSA_right(SA);
234
235 }
236
237 private void findStartOfBuckets() {
238 initBuckets();
239
240 int sum = 0;
241 for (int i = 0; i <= K; ++i) {
242 sum += bucket[i];
243 bucket[i] = sum - bucket[i];
244 }
245 }
246
247 private void findEndOfBuckets() {
248 initBuckets();
249
250 int sum = 0;
251 for (int i = 0; i <= K; ++i) {
252 sum += bucket[i];
253 bucket[i] = sum;
254 }
255 }
256
257 private void initBuckets() {
258
259 for (int i = 0; i <= K; ++i) {
260 bucket[i] = 0;
261 }
262
263 for (int i = 0; i < N; ++i) {
264 bucket[input.get(i)]++;
265 }
266 }
267
268 boolean isLMS(int pos) {
269 return pos > 0 && typeLS.get(pos) && !typeLS.get(pos - 1);
270 }
271
272 private void induceSA_left(int[] SA) {
273 findStartOfBuckets();
274 int j;
275 for (int i = 0; i < N; ++i) {
276 j = SA[i] - 1;
277 if (j >= 0 && !typeLS.get(j))
278 SA[bucket[input.get(j)]++] = j;
279 }
280 }
281
282 private void induceSA_right(int[] SA) {
283 findEndOfBuckets();
284 int j;
285 for (int i = N - 1; i >= 0; --i) {
286 j = SA[i] - 1;
287 if (j >= 0 && typeLS.get(j))
288 SA[--bucket[input.get(j)]] = j;
289 }
290 }
291
292 }