List of usage examples for the org.apache.hadoop.io.compress.bzip2.CBZip2InputStream constructor CBZip2InputStream(InputStream)
public CBZip2InputStream(final InputStream in) throws IOException
From source file:Importer.java
License:Open Source License
public static void copyFile(File file) throws Exception { // String TEST_PREFIX = ""; File destFile = new File(outDir, file.getName() + ".seq"); Path dest = new Path(destFile.getAbsolutePath()); Configuration conf = new Configuration(); FileSystem fileSys = org.apache.hadoop.fs.FileSystem.get(new java.net.URI(conf.get("fs.default.name")), conf);//from w ww. jav a 2s. c o m CompressionCodec codec = new DefaultCodec(); fileSys.mkdirs(dest.getParent()); FSDataOutputStream outputStr = fileSys.create(dest); seqFileWriter = SequenceFile.createWriter(conf, outputStr, Text.class, Text.class, SequenceFile.CompressionType.BLOCK, codec); String filename = file.getName(); InputStream in = new BufferedInputStream(new FileInputStream(file)); if (filename.endsWith(".bz2")) { in.read(); in.read(); //snarf header in = new CBZip2InputStream(in); } BufferedReader br = new BufferedReader(new InputStreamReader(in, "US-ASCII")); System.out.println("working on file " + file); int records = 0; long bytes = 0, bytes_since_status = 0; long startTime = System.currentTimeMillis(); String s = null; Text content = new Text(); while ((s = br.readLine()) != null) { if (s.startsWith("---END.OF.DOCUMENT---")) { Text name = new Text(hash(content)); seqFileWriter.append(name, content); records++; content = new Text(); } else { byte[] line_as_bytes = (s + " ").getBytes(); for (byte b : line_as_bytes) { assert b < 128 : "found an unexpected high-bit set"; } content.append(line_as_bytes, 0, line_as_bytes.length); bytes += line_as_bytes.length; /* bytes_since_status += line_as_bytes.length; if(bytes_since_status > 10 * 1024 * 1024) { //every 10 MB System.err.print('.'); bytes_since_status = 0; }*/ } } //end while if (content.getLength() > 5) { Text name = new Text(hash(content)); seqFileWriter.append(name, content); records++; } totalBytes += bytes; totalRecords += records; long time = (System.currentTimeMillis() - startTime) / 1000 + 1; long kbSec = bytes / 1024 / time; System.out.println(new 
java.util.Date()); System.out.println("File " + file.getName() + " " + records + " records, " + bytes + " bytes in " + time + " seconds (" + kbSec + " KB/sec)."); in.close(); seqFileWriter.close(); outputStr.close(); }
From source file:bme.iclef.hadoop.file2seq.TarToSeqFile.java
License:Apache License
/**
 * Opens {@code inputFile} as a tar stream, choosing a decompressor from the file-name suffix:
 * gzip for {@code .tar.gz}/{@code .tgz}, bzip2 for {@code .tar.bz2}/{@code .tbz2}, and no
 * decompression otherwise.
 *
 * @return a tar stream positioned at the start of the archive; the caller must close it
 * @throws Exception if the file cannot be opened or its compression header cannot be read
 */
private TarInputStream openInputFile() throws Exception {
    InputStream fileStream = new FileInputStream(inputFile);
    boolean opened = false;
    try {
        String name = inputFile.getName();
        InputStream theStream;
        if (name.endsWith(".tar.gz") || name.endsWith(".tgz")) {
            theStream = new GZIPInputStream(fileStream);
        } else if (name.endsWith(".tar.bz2") || name.endsWith(".tbz2")) {
            /*
             * Skip the "BZ" header added by bzip2. InputStream.skip() may skip fewer bytes than
             * requested, so consume the two bytes with read() and detect a truncated file.
             */
            if (fileStream.read() < 0 || fileStream.read() < 0) {
                throw new java.io.IOException("Unexpected end of file while skipping bzip2 header: " + inputFile);
            }
            theStream = new CBZip2InputStream(fileStream);
        } else {
            /* Assume uncompressed tar file. */
            theStream = fileStream;
        }
        TarInputStream tarStream = new TarInputStream(theStream);
        opened = true;
        return tarStream;
    } finally {
        if (!opened) {
            // Do not leak the file handle when a wrapper stream could not be constructed.
            try {
                fileStream.close();
            } catch (java.io.IOException ignored) {
                // The original failure is the one worth reporting.
            }
        }
    }
}
From source file:com.endgame.binarypig.util.BuildSequenceFileFromArchive.java
License:Apache License
public void load(FileSystem fs, Configuration conf, File archive, Path outputDir) throws Exception { Text key = new Text(); BytesWritable val = new BytesWritable(); SequenceFile.Writer writer = null; ArchiveInputStream archiveInputStream = null; try {/*w w w. j a v a 2 s . c om*/ Path sequenceName = new Path(outputDir, archive.getName() + ".seq"); System.out.println("Writing to " + sequenceName); writer = SequenceFile.createWriter(fs, conf, sequenceName, Text.class, BytesWritable.class, CompressionType.RECORD); String lowerName = archive.toString().toLowerCase(); if (lowerName.endsWith(".tar.gz") || lowerName.endsWith(".tgz")) { archiveInputStream = new ArchiveStreamFactory().createArchiveInputStream("tar", new GZIPInputStream(new FileInputStream(archive))); } else if (lowerName.endsWith(".tar.bz") || lowerName.endsWith(".tar.bz2") || lowerName.endsWith(".tbz")) { FileInputStream is = new FileInputStream(archive); is.read(); // read 'B' is.read(); // read 'Z' archiveInputStream = new ArchiveStreamFactory().createArchiveInputStream("tar", new CBZip2InputStream(is)); } else if (lowerName.endsWith(".tar")) { archiveInputStream = new ArchiveStreamFactory().createArchiveInputStream("tar", new FileInputStream(archive)); } else if (lowerName.endsWith(".zip")) { archiveInputStream = new ArchiveStreamFactory().createArchiveInputStream("zip", new FileInputStream(archive)); } else { throw new RuntimeException("Can't handle archive format for: " + archive); } ArchiveEntry entry = null; while ((entry = archiveInputStream.getNextEntry()) != null) { if (!entry.isDirectory()) { try { byte[] outputFile = IOUtils.toByteArray(archiveInputStream); val.set(outputFile, 0, outputFile.length); key.set(DigestUtils.md5Hex(outputFile)); writer.append(key, val); } catch (IOException e) { System.err.println("Warning: archive may be truncated: " + archive); // Truncated Archive break; } } } } finally { archiveInputStream.close(); writer.close(); } }
From source file:com.mucommander.commons.file.impl.bzip2.Bzip2ArchiveFile.java
License:Open Source License
@Override public InputStream getEntryInputStream(ArchiveEntry entry, ArchiveEntryIterator entryIterator) throws IOException { try {//from w w w .j a va 2 s . c o m InputStream in = getInputStream(); // Skips the 2 magic bytes 'BZ', as required by CBZip2InputStream. Quoted from CBZip2InputStream's Javadoc: // "Although BZip2 headers are marked with the magic 'Bz'. this constructor expects the next byte in the // stream to be the first one after the magic. Thus callers have to skip the first two bytes. Otherwise // this constructor will throw an exception." // Note: the return value of read() is unchecked. In the unlikely event that EOF is reached in the first // 2 bytes, CBZip2InputStream will throw an IOException. in.read(); in.read(); // Quoted from CBZip2InputStream's Javadoc: // "CBZip2InputStream reads bytes from the compressed source stream via the single byte {@link java.io.InputStream#read() // read()} method exclusively. Thus you should consider to use a buffered source stream." return new CBZip2InputStream(new BufferedInputStream(in)); } catch (Exception e) { // CBZip2InputStream is known to throw NullPointerException if file is not properly Bzip2-encoded // so we need to catch those and throw them as IOException LOGGER.info("Exception caught while creating CBZip2InputStream, throwing IOException", e); throw new IOException(); } }
From source file:com.mucommander.commons.file.impl.tar.TarArchiveFile.java
License:Open Source License
/** * Returns a TarInputStream which can be used to read TAR entries. * * @param entryOffset offset from the start of the archive to an entry. Must be a multiple of recordSize, or * <code>0</code> to start at the first entry. * @return a TarInputStream which can be used to read TAR entries * @throws IOException if an error occurred while create the stream * @throws UnsupportedFileOperationException if this operation is not supported by the underlying filesystem, * or is not implemented./*from w w w . j a v a 2s . c om*/ */ private TarInputStream createTarStream(long entryOffset) throws IOException, UnsupportedFileOperationException { InputStream in = file.getInputStream(); String name = getName(); // Gzip-compressed file if (StringUtils.endsWithIgnoreCase(name, "tgz") || StringUtils.endsWithIgnoreCase(name, "tar.gz")) // Note: this will fail for gz/tgz entries inside a tar file (IOException: Not in GZIP format), // why is a complete mystery: the gz/tgz entry can be extracted and then properly browsed in = new GZIPInputStream(in); // Bzip2-compressed file else if (StringUtils.endsWithIgnoreCase(name, "tbz2") || StringUtils.endsWithIgnoreCase(name, "tar.bz2")) { try { // Skips the 2 magic bytes 'BZ', as required by CBZip2InputStream. Quoted from CBZip2InputStream's Javadoc: // "Although BZip2 headers are marked with the magic 'Bz'. this constructor expects the next byte in the // stream to be the first one after the magic. Thus callers have to skip the first two bytes. Otherwise // this constructor will throw an exception." StreamUtils.skipFully(in, 2); // Quoted from CBZip2InputStream's Javadoc: // "CBZip2InputStream reads bytes from the compressed source stream via the single byte {@link java.io.InputStream#read() // read()} method exclusively. Thus you should consider to use a buffered source stream." 
in = new CBZip2InputStream(new BufferedInputStream(in)); } catch (Exception e) { // CBZip2InputStream is known to throw NullPointerException if file is not properly Bzip2-encoded // so we need to catch those and throw them as IOException LOGGER.info("Exception caught while creating CBZip2InputStream, throwing IOException", e); throw new IOException(); } } return new TarInputStream(in, entryOffset); }
From source file:io.covert.binary.analysis.BuildSequenceFileFromTarball.java
License:Apache License
public void load(FileSystem fs, Configuration conf, File inputTarball, Path outputDir) throws Exception { Text key = new Text(); BytesWritable val = new BytesWritable(); Path sequenceName = new Path(outputDir, inputTarball.getName() + ".seq"); System.out.println("Writing to " + sequenceName); SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, sequenceName, Text.class, BytesWritable.class, CompressionType.RECORD); InputStream is = new FileInputStream(inputTarball); if (inputTarball.toString().toLowerCase().endsWith(".gz")) { is = new GZIPInputStream(is); } else if (inputTarball.toString().toLowerCase().endsWith(".bz") || inputTarball.toString().endsWith(".bz2")) { is.read(); // read 'B' is.read(); // read 'Z' is = new CBZip2InputStream(is); }//from ww w . j a v a 2s . c o m final TarArchiveInputStream debInputStream = (TarArchiveInputStream) new ArchiveStreamFactory() .createArchiveInputStream("tar", is); TarArchiveEntry entry = null; while ((entry = (TarArchiveEntry) debInputStream.getNextEntry()) != null) { if (!entry.isDirectory()) { try { final ByteArrayOutputStream outputFileStream = new ByteArrayOutputStream(); IOUtils.copy(debInputStream, outputFileStream); outputFileStream.close(); byte[] outputFile = outputFileStream.toByteArray(); val.set(outputFile, 0, outputFile.length); MessageDigest md = MessageDigest.getInstance("MD5"); md.update(outputFile); byte[] digest = md.digest(); String hexdigest = ""; for (int i = 0; i < digest.length; i++) { hexdigest += Integer.toString((digest[i] & 0xff) + 0x100, 16).substring(1); } key.set(hexdigest); writer.append(key, val); } catch (IOException e) { System.err.println("Warning: tarball may be truncated: " + inputTarball); // Truncated Tarball break; } } } debInputStream.close(); writer.close(); }