View Javadoc

1   /*--------------------------------------------------------------------------
2    *  Copyright 2009 utgenome.org
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *--------------------------------------------------------------------------*/
16  //--------------------------------------
17  // utgb-core Project
18  //
19  // CompactFASTAGenerator.java
20  // Since: Feb 22, 2010
21  //
22  // $URL$ 
23  // $Author$
24  //--------------------------------------
25  package org.utgenome.format.fasta;
26  
27  import java.io.BufferedOutputStream;
28  import java.io.BufferedWriter;
29  import java.io.File;
30  import java.io.FileInputStream;
31  import java.io.FileOutputStream;
32  import java.io.FileWriter;
33  import java.io.IOException;
34  import java.io.InputStream;
35  import java.net.URL;
36  
37  import org.xerial.silk.SilkWriter;
38  import org.xerial.util.FileType;
39  import org.xerial.util.log.Logger;
40  
41  /**
42   * 
43   * 
44   * @author leo
45   * 
46   */
47  public class CompactFASTAGenerator {
48  
49  	private static Logger _logger = Logger.getLogger(CompactFASTAGenerator.class);
50  
51  	private CompactACGTWriter compressor;
52  	private SilkWriter indexOut;
53  	private String workDir = "target";
54  
55  	public int BUFFER_SIZE = 8 * 1024 * 1024;
56  
57  	public CompactFASTAGenerator() {
58  
59  	}
60  
61  	public void setBuffferSize(int bufferSizeInMB) {
62  		BUFFER_SIZE = bufferSizeInMB * 1024 * 1024;
63  	}
64  
65  	public void setWorkDir(String workDir) {
66  		this.workDir = workDir;
67  	}
68  
69  	public void setWorkDir(File workDir) {
70  		this.workDir = workDir.getAbsolutePath();
71  	}
72  
73  	public String getWorkDir() {
74  		return workDir;
75  	}
76  
77  	public void packFASTA(String fastaFilePath) throws IOException {
78  		packFASTA(fastaFilePath, new FileInputStream(fastaFilePath));
79  	}
80  
81  	public void packFASTA(URL fastaFile) throws IOException {
82  		packFASTA(fastaFile.getPath(), fastaFile.openStream());
83  	}
84  
85  	public void packFASTA(String fastaFilePrefix, InputStream in) throws IOException {
86  
87  		File work = new File(workDir);
88  		if (!work.exists()) {
89  			_logger.info("create a directory: " + work);
90  			work.mkdirs();
91  		}
92  
93  		String fileName = new File(fastaFilePrefix).getName();
94  		String baseName = fileName.endsWith(".fa") ? fileName : FileType.removeFileExt(fileName);
95  
96  		// output files
97  		String pacSeqFile = baseName + CompactFASTA.PAC_FILE_SUFFIX;
98  		String pacNSeqFile = baseName + CompactFASTA.PAC_N_FILE_SUFFIX;
99  		String pacIndexFile = baseName + CompactFASTA.PAC_INDEX_FILE_SUFFIX;
100 		_logger.info("pac file: " + pacSeqFile);
101 		_logger.info("pac file for N: " + pacNSeqFile);
102 		_logger.info("pac index file: " + pacIndexFile);
103 
104 		BufferedOutputStream pacSeqOut = new BufferedOutputStream(new FileOutputStream(new File(workDir, pacSeqFile)), BUFFER_SIZE);
105 		BufferedOutputStream pacNSeqOut = new BufferedOutputStream(new FileOutputStream(new File(workDir, pacNSeqFile)), BUFFER_SIZE);
106 		compressor = new CompactACGTWriter(pacSeqOut, pacNSeqOut);
107 		indexOut = new SilkWriter(new BufferedWriter(new FileWriter(new File(workDir, pacIndexFile))));
108 
109 		indexOut.preamble();
110 		indexOut.schema("sequence").attribute("name").attribute("description").attribute("length").attribute("offset");
111 
112 		// load FASTA file (.fa, .fa.tar.gz, ...)
113 		packFASTA(new FASTAPullParser(fileName, in, BUFFER_SIZE));
114 
115 		compressor.close();
116 		indexOut.close();
117 
118 		_logger.info("pack done.");
119 
120 	}
121 
122 	private void packFASTA(FASTAPullParser fastaParser) throws IOException {
123 		String description;
124 		while ((description = fastaParser.nextDescriptionLine()) != null) {
125 
126 			String sequenceName = CompactFASTA.pickSequenceName(description);
127 			_logger.info(String.format("loading %s ...", sequenceName));
128 			long start = compressor.getSequenceLength();
129 
130 			String seq = null;
131 			while ((seq = fastaParser.nextSequenceLine()) != null) {
132 				compressor.append(seq);
133 			}
134 
135 			long end = compressor.getSequenceLength();
136 			long sequenceLength = end - start;
137 
138 			SilkWriter s = indexOut.node("sequence").attribute("name", sequenceName);
139 			s.leaf("description", description);
140 			s.leaf("length", Long.toString(sequenceLength));
141 			s.leaf("offset", Long.toString(start));
142 		}
143 
144 	}
145 
146 }