1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.utgenome.format.fasta;
26
27 import java.io.BufferedOutputStream;
28 import java.io.BufferedWriter;
29 import java.io.File;
30 import java.io.FileInputStream;
31 import java.io.FileOutputStream;
32 import java.io.FileWriter;
33 import java.io.IOException;
34 import java.io.InputStream;
35 import java.net.URL;
36
37 import org.xerial.silk.SilkWriter;
38 import org.xerial.util.FileType;
39 import org.xerial.util.log.Logger;
40
41
42
43
44
45
46
47 public class CompactFASTAGenerator {
48
49 private static Logger _logger = Logger.getLogger(CompactFASTAGenerator.class);
50
51 private CompactACGTWriter compressor;
52 private SilkWriter indexOut;
53 private String workDir = "target";
54
55 public int BUFFER_SIZE = 8 * 1024 * 1024;
56
57 public CompactFASTAGenerator() {
58
59 }
60
61 public void setBuffferSize(int bufferSizeInMB) {
62 BUFFER_SIZE = bufferSizeInMB * 1024 * 1024;
63 }
64
65 public void setWorkDir(String workDir) {
66 this.workDir = workDir;
67 }
68
69 public void setWorkDir(File workDir) {
70 this.workDir = workDir.getAbsolutePath();
71 }
72
73 public String getWorkDir() {
74 return workDir;
75 }
76
77 public void packFASTA(String fastaFilePath) throws IOException {
78 packFASTA(fastaFilePath, new FileInputStream(fastaFilePath));
79 }
80
81 public void packFASTA(URL fastaFile) throws IOException {
82 packFASTA(fastaFile.getPath(), fastaFile.openStream());
83 }
84
85 public void packFASTA(String fastaFilePrefix, InputStream in) throws IOException {
86
87 File work = new File(workDir);
88 if (!work.exists()) {
89 _logger.info("create a directory: " + work);
90 work.mkdirs();
91 }
92
93 String fileName = new File(fastaFilePrefix).getName();
94 String baseName = fileName.endsWith(".fa") ? fileName : FileType.removeFileExt(fileName);
95
96
97 String pacSeqFile = baseName + CompactFASTA.PAC_FILE_SUFFIX;
98 String pacNSeqFile = baseName + CompactFASTA.PAC_N_FILE_SUFFIX;
99 String pacIndexFile = baseName + CompactFASTA.PAC_INDEX_FILE_SUFFIX;
100 _logger.info("pac file: " + pacSeqFile);
101 _logger.info("pac file for N: " + pacNSeqFile);
102 _logger.info("pac index file: " + pacIndexFile);
103
104 BufferedOutputStream pacSeqOut = new BufferedOutputStream(new FileOutputStream(new File(workDir, pacSeqFile)), BUFFER_SIZE);
105 BufferedOutputStream pacNSeqOut = new BufferedOutputStream(new FileOutputStream(new File(workDir, pacNSeqFile)), BUFFER_SIZE);
106 compressor = new CompactACGTWriter(pacSeqOut, pacNSeqOut);
107 indexOut = new SilkWriter(new BufferedWriter(new FileWriter(new File(workDir, pacIndexFile))));
108
109 indexOut.preamble();
110 indexOut.schema("sequence").attribute("name").attribute("description").attribute("length").attribute("offset");
111
112
113 packFASTA(new FASTAPullParser(fileName, in, BUFFER_SIZE));
114
115 compressor.close();
116 indexOut.close();
117
118 _logger.info("pack done.");
119
120 }
121
122 private void packFASTA(FASTAPullParser fastaParser) throws IOException {
123 String description;
124 while ((description = fastaParser.nextDescriptionLine()) != null) {
125
126 String sequenceName = CompactFASTA.pickSequenceName(description);
127 _logger.info(String.format("loading %s ...", sequenceName));
128 long start = compressor.getSequenceLength();
129
130 String seq = null;
131 while ((seq = fastaParser.nextSequenceLine()) != null) {
132 compressor.append(seq);
133 }
134
135 long end = compressor.getSequenceLength();
136 long sequenceLength = end - start;
137
138 SilkWriter s = indexOut.node("sequence").attribute("name", sequenceName);
139 s.leaf("description", description);
140 s.leaf("length", Long.toString(sequenceLength));
141 s.leaf("offset", Long.toString(start));
142 }
143
144 }
145
146 }