List of usage examples for the org.apache.commons.compress.archivers.tar TarArchiveEntry constructors
public TarArchiveEntry(byte[] headerBuf)
From source file:org.apache.nutch.tools.CommonCrawlDataDumper.java
/**
 * Dumps the reverse-engineered CBOR content from the provided segment
 * directories. If the parent directory contains more than one segment they
 * are all processed; otherwise a single segment can be passed as the
 * argument.
 *
 * @param outputDir      the directory to dump the raw content to; it will be created.
 * @param segmentRootDir a directory containing one or more segments.
 * @param linkdb         path to the linkdb, or {@code null} to skip inlink resolution.
 * @param gzip           if {@code true}, documents are collected into a tar stream
 *                       instead of being written as individual files.
 * @param mimeTypes      if non-null, only these MIME types are counted as filtered
 *                       and written; {@code null} means "write everything".
 * @param epochFilename  if {@code true}, output files are named using the epoch
 *                       time (in milliseconds).
 * @param extension      a file extension to force on output documents; empty to
 *                       derive it from the URL (falling back to "html").
 * @param warc           if {@code true}, produce "WARC" format output rather than
 *                       "JACKSON", and skip all per-file/tar writing below.
 * @throws Exception if any exception occurs.
 */
public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes,
        boolean epochFilename, String extension, boolean warc) throws Exception {
    if (gzip) {
        LOG.info("Gzipping CBOR data has been skipped");
    }
    // total file counts, keyed by detected MIME type
    Map<String, Integer> typeCounts = new HashMap<>();
    // counts restricted to the caller-requested MIME types
    Map<String, Integer> filteredCounts = new HashMap<>();

    Configuration nutchConfig = NutchConfiguration.create();
    Path segmentRootPath = new Path(segmentRootDir.toString());
    FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);

    // Recursively collect every segment data file matching .../content/part-NNNNN/data
    List<Path> parts = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
    String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}"
            + File.separator + "data";
    while (files.hasNext()) {
        LocatedFileStatus next = files.next();
        if (next.isFile()) {
            Path path = next.getPath();
            if (path.toString().matches(partPattern)) {
                parts.add(path);
            }
        }
    }

    LinkDbReader linkDbReader = null;
    if (linkdb != null) {
        linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
    }
    // NOTE(review): parts is never null here; only the empty case is reachable.
    if (parts == null || parts.size() == 0) {
        LOG.error("No segment directories found in {} ", segmentRootDir.getAbsolutePath());
        System.exit(1);
    }
    LOG.info("Found {} segment parts", parts.size());
    if (gzip && !warc) {
        fileList = new ArrayList<>();
        constructNewStream(outputDir);
    }

    for (Path segmentPart : parts) {
        LOG.info("Processing segment Part : [ {} ]", segmentPart);
        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig,
                    SequenceFile.Reader.file(segmentPart));

            Writable key = (Writable) reader.getKeyClass().newInstance();

            Content content = null;
            while (reader.next(key)) {
                content = new Content();
                reader.getCurrentValue(content);
                Metadata metadata = content.getMetadata();
                String url = key.toString();
                String baseName = FilenameUtils.getBaseName(url);
                String extensionName = FilenameUtils.getExtension(url);
                // Forced extension wins; otherwise the URL's extension; otherwise "html".
                if (!extension.isEmpty()) {
                    extensionName = extension;
                } else if ((extensionName == null) || extensionName.isEmpty()) {
                    extensionName = "html";
                }

                String outputFullPath = null;
                String outputRelativePath = null;
                String filename = null;
                String timestamp = null;
                String reverseKey = null;

                if (epochFilename || config.getReverseKey()) {
                    try {
                        // Parse the HTTP "Date" response header (RFC-1123 style pattern).
                        long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z")
                                .parse(getDate(metadata.get("Date"))).getTime();
                        timestamp = String.valueOf(epoch);
                    } catch (ParseException pe) {
                        LOG.warn(pe.getMessage());
                    }
                    reverseKey = reverseUrl(url);
                    config.setReverseKeyValue(
                            reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url) + "_" + timestamp);
                }

                if (!warc) {
                    if (epochFilename) {
                        outputFullPath = DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(),
                                reverseKey, url, timestamp, extensionName, !gzip);
                        // NOTE(review): the "- 1" also drops the character before the last
                        // separator — looks like an off-by-one; confirm it is intended.
                        outputRelativePath = outputFullPath.substring(0,
                                outputFullPath.lastIndexOf(File.separator) - 1);
                        filename = content.getMetadata().get(Metadata.DATE) + "." + extensionName;
                    } else {
                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(),
                                md5Ofurl, !gzip);
                        filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
                        outputFullPath = String.format("%s/%s", fullDir, filename);

                        String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
                        String firstLevelDirName = fullPathLevels[fullPathLevels.length - 2];
                        String secondLevelDirName = fullPathLevels[fullPathLevels.length - 1];
                        // NOTE(review): the two directory names are concatenated without a
                        // separator — verify downstream consumers expect that.
                        outputRelativePath = firstLevelDirName + secondLevelDirName;
                    }
                }
                // Encode all filetypes if no mimetypes have been given
                Boolean filter = (mimeTypes == null);

                String jsonData = "";
                try {
                    String mimeType = new Tika().detect(content.getContent());
                    // Map the document to a JSON-based structure.

                    Set<String> inUrls = null; // there may be duplicates, so using a set
                    if (linkDbReader != null) {
                        Inlinks inlinks = linkDbReader.getInlinks((Text) key);
                        if (inlinks != null) {
                            Iterator<Inlink> iterator = inlinks.iterator();
                            inUrls = new LinkedHashSet<>();
                            while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()) {
                                inUrls.add(iterator.next().getFromUrl());
                            }
                        }
                    }
                    // TODO: Make this Jackson Format implementation reusable
                    try (CommonCrawlFormat format = CommonCrawlFormatFactory
                            .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
                        if (inUrls != null) {
                            format.setInLinks(new ArrayList<>(inUrls));
                        }
                        jsonData = format.getJsonData(url, content, metadata);
                    }
                    collectStats(typeCounts, mimeType);
                    // Collect statistics for the caller-requested mimetypes only.
                    if ((mimeType != null) && (mimeTypes != null)
                            && Arrays.asList(mimeTypes).contains(mimeType)) {
                        collectStats(filteredCounts, mimeType);
                        filter = true;
                    }
                } catch (IOException ioe) {
                    // NOTE(review): aborts the whole dump on the first bad document —
                    // confirm that "continue" was not intended instead of "return".
                    LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
                    return;
                }

                if (!warc) {
                    if (filter) {
                        byte[] byteData = serializeCBORData(jsonData);

                        if (!gzip) {
                            File outputFile = new File(outputFullPath);
                            if (outputFile.exists()) {
                                LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                            } else {
                                LOG.info("Writing: [" + outputFullPath + "]");
                                // NOTE(review): neither stream is closed here — the
                                // FileOutputStream leaks; consider try-with-resources.
                                IOUtils.copy(new ByteArrayInputStream(byteData),
                                        new FileOutputStream(outputFile));
                            }
                        } else {
                            if (fileList.contains(outputFullPath)) {
                                LOG.info("Skipping compressing: [" + outputFullPath
                                        + "]: file already exists");
                            } else {
                                fileList.add(outputFullPath);
                                LOG.info("Compressing: [" + outputFullPath + "]");
                                TarArchiveEntry tarEntry = new TarArchiveEntry(
                                        outputRelativePath + File.separator + filename);
                                tarEntry.setSize(byteData.length);
                                tarOutput.putArchiveEntry(tarEntry);
                                tarOutput.write(byteData);
                                tarOutput.closeArchiveEntry();
                            }
                        }
                    }
                }
            }
            reader.close();
        } catch (Exception e) {
            LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
        } finally {
            // NOTE(review): closing the (possibly cached) FileSystem inside the loop
            // may break subsequent iterations — confirm this is intended.
            fs.close();
        }
    }
    if (gzip && !warc) {
        closeStream();
    }

    if (!typeCounts.isEmpty()) {
        LOG.info("CommonsCrawlDataDumper File Stats: "
                + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    }
}
From source file:org.apache.openaz.xacml.admin.components.PolicyWorkspace.java
@Override public InputStream getStream() { ///* www .j a v a2 s . c o m*/ // Grab our working repository // final Path repoPath = ((XacmlAdminUI) getUI()).getUserGitPath(); Path workspacePath = ((XacmlAdminUI) getUI()).getUserWorkspace(); final Path tarFile = Paths.get(workspacePath.toString(), "Repository.tgz"); try (OutputStream os = Files.newOutputStream(tarFile)) { try (GzipCompressorOutputStream gzOut = new GzipCompressorOutputStream(os)) { try (TarArchiveOutputStream tarOut = new TarArchiveOutputStream(gzOut)) { tarOut.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU); Files.walkFileTree(repoPath, new SimpleFileVisitor<Path>() { @Override public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { if (dir.getFileName().toString().startsWith(".git")) { return FileVisitResult.SKIP_SUBTREE; } Path relative = repoPath.relativize(dir); if (relative.toString().isEmpty()) { return super.preVisitDirectory(dir, attrs); } TarArchiveEntry entry = new TarArchiveEntry(relative.toFile()); tarOut.putArchiveEntry(entry); tarOut.closeArchiveEntry(); return super.preVisitDirectory(dir, attrs); } @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { if (file.getFileName().toString().endsWith(".xml") == false) { return super.visitFile(file, attrs); } Path relative = repoPath.relativize(file); TarArchiveEntry entry = new TarArchiveEntry(relative.toFile()); entry.setSize(Files.size(file)); tarOut.putArchiveEntry(entry); try { IOUtils.copy(Files.newInputStream(file), tarOut); } catch (IOException e) { logger.error(e); } tarOut.closeArchiveEntry(); return super.visitFile(file, attrs); } }); tarOut.finish(); } } } catch (IOException e) { logger.error(e); } try { return Files.newInputStream(tarFile); } catch (IOException e) { logger.error(e); } return null; }
From source file:org.apache.tika.server.TarWriter.java
private static void tarStoreBuffer(TarArchiveOutputStream zip, String name, byte[] dataBuffer) throws IOException { TarArchiveEntry entry = new TarArchiveEntry(name); entry.setSize(dataBuffer.length);//from ww w. j a v a 2s . co m zip.putArchiveEntry(entry); zip.write(dataBuffer); zip.closeArchiveEntry(); }
From source file:org.artifactory.util.ArchiveUtils.java
/**
 * Use for writing streams - must specify file size in advance as well.
 *
 * @param relativePath entry path inside the archive
 * @param archiveType  which archive family the entry is destined for
 * @param size         entry size in bytes, required before streaming content
 * @return a sized {@link ArchiveEntry} of the matching concrete type
 * @throws IllegalArgumentException for archive types with no entry mapping
 */
public static ArchiveEntry createArchiveEntry(String relativePath, ArchiveType archiveType, long size) {
    switch (archiveType) {
    case ZIP: {
        ZipArchiveEntry entry = new ZipArchiveEntry(relativePath);
        entry.setSize(size);
        return entry;
    }
    case TAR:
    case TARGZ:
    case TGZ: {
        // All tar-based variants share the same entry type; compression is
        // applied by the surrounding stream, not the entry.
        TarArchiveEntry entry = new TarArchiveEntry(relativePath);
        entry.setSize(size);
        return entry;
    }
    default:
        throw new IllegalArgumentException("Unsupported archive type: '" + archiveType + "'");
    }
}
From source file:org.codehaus.plexus.archiver.tar.TarRoundTripTest.java
/** * test round-tripping long (GNU) entries *///from w w w . j a va 2s . c o m public void testLongRoundTripping() throws IOException { TarArchiveEntry original = new TarArchiveEntry(LONG_NAME); assertEquals("over 100 chars", true, LONG_NAME.length() > 100); assertEquals("original name", LONG_NAME, original.getName()); ByteArrayOutputStream buff = new ByteArrayOutputStream(); TarArchiveOutputStream tos = new TarArchiveOutputStream(buff); tos.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU); tos.putArchiveEntry(original); tos.closeArchiveEntry(); tos.close(); TarArchiveInputStream tis = new TarArchiveInputStream(new ByteArrayInputStream(buff.toByteArray())); TarArchiveEntry tripped = tis.getNextTarEntry(); assertEquals("round-tripped name", LONG_NAME, tripped.getName()); assertNull("no more entries", tis.getNextEntry()); tis.close(); }
From source file:org.dcm4chee.storage.tar.TarContainerProvider.java
/**
 * Streams the given container entries to {@code out} as an uncompressed tar,
 * optionally preceded by a checksum entry, finishing the archive without
 * closing the caller's stream.
 */
@Override
public void writeEntriesTo(StorageContext context, List<ContainerEntry> entries, OutputStream out)
        throws IOException {
    TarArchiveOutputStream tarStream = new TarArchiveOutputStream(out);

    // Optional leading checksum entry, built in memory first so its size is known.
    String checksumEntryName = container.getChecksumEntry();
    if (checksumEntryName != null) {
        ByteArrayOutputStream checksumBytes = new ByteArrayOutputStream();
        ContainerEntry.writeChecksumsTo(entries, checksumBytes);
        TarArchiveEntry checksumTarEntry = new TarArchiveEntry(checksumEntryName);
        checksumTarEntry.setSize(checksumBytes.size());
        tarStream.putArchiveEntry(checksumTarEntry);
        tarStream.write(checksumBytes.toByteArray());
        tarStream.closeArchiveEntry();
    }

    // One tar entry per container entry, copied straight from its source file.
    for (ContainerEntry containerEntry : entries) {
        Path sourcePath = containerEntry.getSourcePath();
        TarArchiveEntry fileTarEntry = new TarArchiveEntry(containerEntry.getName());
        fileTarEntry.setModTime(Files.getLastModifiedTime(sourcePath).toMillis());
        fileTarEntry.setSize(Files.size(sourcePath));
        tarStream.putArchiveEntry(fileTarEntry);
        Files.copy(sourcePath, tarStream);
        tarStream.closeArchiveEntry();
    }

    // Write the terminating records; deliberately not closing the caller's stream.
    tarStream.finish();
}
From source file:org.dspace.pack.bagit.Bag.java
private void fillArchive(File dirFile, String relBase, ArchiveOutputStream out) throws IOException { for (File file : dirFile.listFiles()) { String relPath = relBase + File.separator + file.getName(); if (file.isDirectory()) { fillArchive(file, relPath, out); } else {//from w w w . jav a2 s .c o m TarArchiveEntry entry = new TarArchiveEntry(relPath); entry.setSize(file.length()); entry.setModTime(0L); out.putArchiveEntry(entry); FileInputStream fin = new FileInputStream(file); Utils.copy(fin, out); out.closeArchiveEntry(); fin.close(); } } }
From source file:org.eclipse.che.api.vfs.TarArchiver.java
/**
 * Appends one virtual file (folder or regular file) to the tar stream,
 * translating access and I/O failures into {@link ServerException}.
 */
private void addTarEntry(VirtualFile virtualFile, TarArchiveOutputStream tarOutputStream)
        throws ServerException {
    try {
        TarArchiveEntry entry = new TarArchiveEntry(getTarEntryName(virtualFile));
        if (virtualFile.isFolder()) {
            // Folders carry no content; mod time pinned to zero.
            entry.setModTime(0);
            tarOutputStream.putArchiveEntry(entry);
        } else {
            entry.setSize(virtualFile.getLength());
            entry.setModTime(virtualFile.getLastModificationDate());
            tarOutputStream.putArchiveEntry(entry);
            try (InputStream content = virtualFile.getContent()) {
                ByteStreams.copy(content, tarOutputStream);
            }
        }
        tarOutputStream.closeArchiveEntry();
    } catch (ForbiddenException e) {
        throw new ServerException(e.getServiceError());
    } catch (IOException e) {
        throw new ServerException(e.getMessage(), e);
    }
}
From source file:org.eclipse.che.api.vfs.TarArchiverTest.java
private byte[] createTestTarArchive() throws IOException { ByteArrayOutputStream byteOut = new ByteArrayOutputStream(); TarArchiveOutputStream tarOut = new TarArchiveOutputStream(byteOut); addDirectoryEntry(tarOut, new TarArchiveEntry("arc/")); addDirectoryEntry(tarOut, new TarArchiveEntry("arc/a/")); addFileEntry(tarOut, "arc/a/_a.txt"); addDirectoryEntry(tarOut, new TarArchiveEntry("arc/b/")); addFileEntry(tarOut, "arc/b/_b.txt"); addDirectoryEntry(tarOut, new TarArchiveEntry("arc/c/")); addFileEntry(tarOut, "arc/c/_c.txt"); tarOut.close();//from ww w . java2 s.c om return byteOut.toByteArray(); }
From source file:org.eclipse.che.api.vfs.TarArchiverTest.java
private void addFileEntry(TarArchiveOutputStream tarOut, String name) throws IOException { TarArchiveEntry entryA = new TarArchiveEntry(name); entryA.setSize(TEST_CONTENT_BYTES.length); tarOut.putArchiveEntry(entryA);//w w w.j a v a 2 s. c o m tarOut.write(TEST_CONTENT_BYTES); tarOut.closeArchiveEntry(); }