List of usage examples for the org.apache.commons.compress.archivers.tar TarArchiveEntry constructors
public TarArchiveEntry(byte[] headerBuf)
From source file:org.apache.nutch.tools.CommonCrawlDataDumper.java
/**
 * Dumps the reverse-engineered CBOR content from the provided segment
 * directories. If the parent directory contains more than one segment they
 * are all processed; otherwise a single segment can be passed as the
 * argument.
 *
 * @param outputDir      the directory to dump the raw content to; it will be created.
 * @param segmentRootDir a directory containing one or more segments.
 * @param linkdb         path to the linkdb, or {@code null} to skip inlink resolution.
 * @param gzip           if {@code true}, documents are collected into a tar stream
 *                       instead of being written as individual files.
 * @param mimeTypes      if non-null, only these MIME types are counted as filtered
 *                       and written; {@code null} means "write everything".
 * @param epochFilename  if {@code true}, output files are named using the epoch
 *                       time (in milliseconds).
 * @param extension      a file extension to force on output documents; empty to
 *                       derive it from the URL (falling back to "html").
 * @param warc           if {@code true}, produce "WARC" format output rather than
 *                       "JACKSON", and skip all per-file/tar writing below.
 * @throws Exception if any exception occurs.
 */
public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes,
        boolean epochFilename, String extension, boolean warc) throws Exception {
    if (gzip) {
        LOG.info("Gzipping CBOR data has been skipped");
    }
    // total file counts, keyed by detected MIME type
    Map<String, Integer> typeCounts = new HashMap<>();
    // counts restricted to the caller-requested MIME types
    Map<String, Integer> filteredCounts = new HashMap<>();

    Configuration nutchConfig = NutchConfiguration.create();
    Path segmentRootPath = new Path(segmentRootDir.toString());
    FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);

    // Recursively collect every segment data file matching .../content/part-NNNNN/data
    List<Path> parts = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
    String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}"
            + File.separator + "data";
    while (files.hasNext()) {
        LocatedFileStatus next = files.next();
        if (next.isFile()) {
            Path path = next.getPath();
            if (path.toString().matches(partPattern)) {
                parts.add(path);
            }
        }
    }

    LinkDbReader linkDbReader = null;
    if (linkdb != null) {
        linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
    }
    // NOTE(review): parts is never null here; only the empty case is reachable.
    if (parts == null || parts.size() == 0) {
        LOG.error("No segment directories found in {} ", segmentRootDir.getAbsolutePath());
        System.exit(1);
    }
    LOG.info("Found {} segment parts", parts.size());
    if (gzip && !warc) {
        fileList = new ArrayList<>();
        constructNewStream(outputDir);
    }

    for (Path segmentPart : parts) {
        LOG.info("Processing segment Part : [ {} ]", segmentPart);
        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig,
                    SequenceFile.Reader.file(segmentPart));

            Writable key = (Writable) reader.getKeyClass().newInstance();

            Content content = null;
            while (reader.next(key)) {
                content = new Content();
                reader.getCurrentValue(content);
                Metadata metadata = content.getMetadata();
                String url = key.toString();
                String baseName = FilenameUtils.getBaseName(url);
                String extensionName = FilenameUtils.getExtension(url);
                // Forced extension wins; otherwise the URL's extension; otherwise "html".
                if (!extension.isEmpty()) {
                    extensionName = extension;
                } else if ((extensionName == null) || extensionName.isEmpty()) {
                    extensionName = "html";
                }

                String outputFullPath = null;
                String outputRelativePath = null;
                String filename = null;
                String timestamp = null;
                String reverseKey = null;

                if (epochFilename || config.getReverseKey()) {
                    try {
                        // Parse the HTTP "Date" response header (RFC-1123 style pattern).
                        long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z")
                                .parse(getDate(metadata.get("Date"))).getTime();
                        timestamp = String.valueOf(epoch);
                    } catch (ParseException pe) {
                        LOG.warn(pe.getMessage());
                    }
                    reverseKey = reverseUrl(url);
                    config.setReverseKeyValue(
                            reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url) + "_" + timestamp);
                }

                if (!warc) {
                    if (epochFilename) {
                        outputFullPath = DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(),
                                reverseKey, url, timestamp, extensionName, !gzip);
                        // NOTE(review): the "- 1" also drops the character before the last
                        // separator — looks like an off-by-one; confirm it is intended.
                        outputRelativePath = outputFullPath.substring(0,
                                outputFullPath.lastIndexOf(File.separator) - 1);
                        filename = content.getMetadata().get(Metadata.DATE) + "." + extensionName;
                    } else {
                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(),
                                md5Ofurl, !gzip);
                        filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
                        outputFullPath = String.format("%s/%s", fullDir, filename);

                        String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
                        String firstLevelDirName = fullPathLevels[fullPathLevels.length - 2];
                        String secondLevelDirName = fullPathLevels[fullPathLevels.length - 1];
                        // NOTE(review): the two directory names are concatenated without a
                        // separator — verify downstream consumers expect that.
                        outputRelativePath = firstLevelDirName + secondLevelDirName;
                    }
                }
                // Encode all filetypes if no mimetypes have been given
                Boolean filter = (mimeTypes == null);

                String jsonData = "";
                try {
                    String mimeType = new Tika().detect(content.getContent());
                    // Map the document to a JSON-based structure.

                    Set<String> inUrls = null; // there may be duplicates, so using a set
                    if (linkDbReader != null) {
                        Inlinks inlinks = linkDbReader.getInlinks((Text) key);
                        if (inlinks != null) {
                            Iterator<Inlink> iterator = inlinks.iterator();
                            inUrls = new LinkedHashSet<>();
                            while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()) {
                                inUrls.add(iterator.next().getFromUrl());
                            }
                        }
                    }
                    // TODO: Make this Jackson Format implementation reusable
                    try (CommonCrawlFormat format = CommonCrawlFormatFactory
                            .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
                        if (inUrls != null) {
                            format.setInLinks(new ArrayList<>(inUrls));
                        }
                        jsonData = format.getJsonData(url, content, metadata);
                    }
                    collectStats(typeCounts, mimeType);
                    // Collect statistics for the caller-requested mimetypes only.
                    if ((mimeType != null) && (mimeTypes != null)
                            && Arrays.asList(mimeTypes).contains(mimeType)) {
                        collectStats(filteredCounts, mimeType);
                        filter = true;
                    }
                } catch (IOException ioe) {
                    // NOTE(review): aborts the whole dump on the first bad document —
                    // confirm that "continue" was not intended instead of "return".
                    LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
                    return;
                }

                if (!warc) {
                    if (filter) {
                        byte[] byteData = serializeCBORData(jsonData);

                        if (!gzip) {
                            File outputFile = new File(outputFullPath);
                            if (outputFile.exists()) {
                                LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                            } else {
                                LOG.info("Writing: [" + outputFullPath + "]");
                                // NOTE(review): neither stream is closed here — the
                                // FileOutputStream leaks; consider try-with-resources.
                                IOUtils.copy(new ByteArrayInputStream(byteData),
                                        new FileOutputStream(outputFile));
                            }
                        } else {
                            if (fileList.contains(outputFullPath)) {
                                LOG.info("Skipping compressing: [" + outputFullPath
                                        + "]: file already exists");
                            } else {
                                fileList.add(outputFullPath);
                                LOG.info("Compressing: [" + outputFullPath + "]");
                                TarArchiveEntry tarEntry = new TarArchiveEntry(
                                        outputRelativePath + File.separator + filename);
                                tarEntry.setSize(byteData.length);
                                tarOutput.putArchiveEntry(tarEntry);
                                tarOutput.write(byteData);
                                tarOutput.closeArchiveEntry();
                            }
                        }
                    }
                }
            }
            reader.close();
        } catch (Exception e) {
            LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
        } finally {
            // NOTE(review): closing the (possibly cached) FileSystem inside the loop
            // may break subsequent iterations — confirm this is intended.
            fs.close();
        }
    }
    if (gzip && !warc) {
        closeStream();
    }

    if (!typeCounts.isEmpty()) {
        LOG.info("CommonsCrawlDataDumper File Stats: "
                + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    }
}
From source file:org.apache.openaz.xacml.admin.components.PolicyWorkspace.java
@Override public InputStream getStream() { ///* www .j a v a2 s . c o m*/ // Grab our working repository // final Path repoPath = ((XacmlAdminUI) getUI()).getUserGitPath(); Path workspacePath = ((XacmlAdminUI) getUI()).getUserWorkspace(); final Path tarFile = Paths.get(workspacePath.toString(), "Repository.tgz"); try (OutputStream os = Files.newOutputStream(tarFile)) { try (GzipCompressorOutputStream gzOut = new GzipCompressorOutputStream(os)) { try (TarArchiveOutputStream tarOut = new TarArchiveOutputStream(gzOut)) { tarOut.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU); Files.walkFileTree(repoPath, new SimpleFileVisitor<Path>() { @Override public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { if (dir.getFileName().toString().startsWith(".git")) { return FileVisitResult.SKIP_SUBTREE; } Path relative = repoPath.relativize(dir); if (relative.toString().isEmpty()) { return super.preVisitDirectory(dir, attrs); } TarArchiveEntry entry = new TarArchiveEntry(relative.toFile()); tarOut.putArchiveEntry(entry); tarOut.closeArchiveEntry(); return super.preVisitDirectory(dir, attrs); } @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { if (file.getFileName().toString().endsWith(".xml") == false) { return super.visitFile(file, attrs); } Path relative = repoPath.relativize(file); TarArchiveEntry entry = new TarArchiveEntry(relative.toFile()); entry.setSize(Files.size(file)); tarOut.putArchiveEntry(entry); try { IOUtils.copy(Files.newInputStream(file), tarOut); } catch (IOException e) { logger.error(e); } tarOut.closeArchiveEntry(); return super.visitFile(file, attrs); } }); tarOut.finish(); } } } catch (IOException e) { logger.error(e); } try { return Files.newInputStream(tarFile); } catch (IOException e) { logger.error(e); } return null; }
From source file:org.apache.tika.server.TarWriter.java
private static void tarStoreBuffer(TarArchiveOutputStream zip, String name, byte[] dataBuffer) throws IOException { TarArchiveEntry entry = new TarArchiveEntry(name); entry.setSize(dataBuffer.length);//from ww w. j a v a 2s . co m zip.putArchiveEntry(entry); zip.write(dataBuffer); zip.closeArchiveEntry(); }
From source file:org.artifactory.util.ArchiveUtils.java
/**
 * Use for writing streams - must specify file size in advance as well.
 *
 * @param relativePath entry path inside the archive
 * @param archiveType  which archive family the entry is destined for
 * @param size         entry size in bytes, required before streaming content
 * @return a sized {@link ArchiveEntry} of the matching concrete type
 * @throws IllegalArgumentException for archive types with no entry mapping
 */
public static ArchiveEntry createArchiveEntry(String relativePath, ArchiveType archiveType, long size) {
    switch (archiveType) {
    case ZIP: {
        ZipArchiveEntry entry = new ZipArchiveEntry(relativePath);
        entry.setSize(size);
        return entry;
    }
    case TAR:
    case TARGZ:
    case TGZ: {
        // All tar-based variants share the same entry type; compression is
        // applied by the surrounding stream, not the entry.
        TarArchiveEntry entry = new TarArchiveEntry(relativePath);
        entry.setSize(size);
        return entry;
    }
    default:
        throw new IllegalArgumentException("Unsupported archive type: '" + archiveType + "'");
    }
}
From source file:org.codehaus.plexus.archiver.tar.TarRoundTripTest.java
/** * test round-tripping long (GNU) entries *///from w w w . j a va 2s . c o m public void testLongRoundTripping() throws IOException { TarArchiveEntry original = new TarArchiveEntry(LONG_NAME); assertEquals("over 100 chars", true, LONG_NAME.length() > 100); assertEquals("original name", LONG_NAME, original.getName()); ByteArrayOutputStream buff = new ByteArrayOutputStream(); TarArchiveOutputStream tos = new TarArchiveOutputStream(buff); tos.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU); tos.putArchiveEntry(original); tos.closeArchiveEntry(); tos.close(); TarArchiveInputStream tis = new TarArchiveInputStream(new ByteArrayInputStream(buff.toByteArray())); TarArchiveEntry tripped = tis.getNextTarEntry(); assertEquals("round-tripped name", LONG_NAME, tripped.getName()); assertNull("no more entries", tis.getNextEntry()); tis.close(); }
From source file:org.dcm4chee.storage.tar.TarContainerProvider.java
/**
 * Streams the given container entries to {@code out} as an uncompressed tar,
 * optionally preceded by a checksum entry, finishing the archive without
 * closing the caller's stream.
 */
@Override
public void writeEntriesTo(StorageContext context, List<ContainerEntry> entries, OutputStream out)
        throws IOException {
    TarArchiveOutputStream tarStream = new TarArchiveOutputStream(out);

    // Optional leading checksum entry, built in memory first so its size is known.
    String checksumEntryName = container.getChecksumEntry();
    if (checksumEntryName != null) {
        ByteArrayOutputStream checksumBytes = new ByteArrayOutputStream();
        ContainerEntry.writeChecksumsTo(entries, checksumBytes);
        TarArchiveEntry checksumTarEntry = new TarArchiveEntry(checksumEntryName);
        checksumTarEntry.setSize(checksumBytes.size());
        tarStream.putArchiveEntry(checksumTarEntry);
        tarStream.write(checksumBytes.toByteArray());
        tarStream.closeArchiveEntry();
    }

    // One tar entry per container entry, copied straight from its source file.
    for (ContainerEntry containerEntry : entries) {
        Path sourcePath = containerEntry.getSourcePath();
        TarArchiveEntry fileTarEntry = new TarArchiveEntry(containerEntry.getName());
        fileTarEntry.setModTime(Files.getLastModifiedTime(sourcePath).toMillis());
        fileTarEntry.setSize(Files.size(sourcePath));
        tarStream.putArchiveEntry(fileTarEntry);
        Files.copy(sourcePath, tarStream);
        tarStream.closeArchiveEntry();
    }

    // Write the terminating records; deliberately not closing the caller's stream.
    tarStream.finish();
}
From source file:org.dspace.pack.bagit.Bag.java
private void fillArchive(File dirFile, String relBase, ArchiveOutputStream out) throws IOException { for (File file : dirFile.listFiles()) { String relPath = relBase + File.separator + file.getName(); if (file.isDirectory()) { fillArchive(file, relPath, out); } else {//from w w w . jav a2 s .c o m TarArchiveEntry entry = new TarArchiveEntry(relPath); entry.setSize(file.length()); entry.setModTime(0L); out.putArchiveEntry(entry); FileInputStream fin = new FileInputStream(file); Utils.copy(fin, out); out.closeArchiveEntry(); fin.close(); } } }
From source file:org.eclipse.che.api.vfs.TarArchiver.java
/**
 * Appends one virtual file (folder or regular file) to the tar stream,
 * translating access and I/O failures into {@link ServerException}.
 */
private void addTarEntry(VirtualFile virtualFile, TarArchiveOutputStream tarOutputStream)
        throws ServerException {
    try {
        TarArchiveEntry entry = new TarArchiveEntry(getTarEntryName(virtualFile));
        if (virtualFile.isFolder()) {
            // Folders carry no content; mod time pinned to zero.
            entry.setModTime(0);
            tarOutputStream.putArchiveEntry(entry);
        } else {
            entry.setSize(virtualFile.getLength());
            entry.setModTime(virtualFile.getLastModificationDate());
            tarOutputStream.putArchiveEntry(entry);
            try (InputStream content = virtualFile.getContent()) {
                ByteStreams.copy(content, tarOutputStream);
            }
        }
        tarOutputStream.closeArchiveEntry();
    } catch (ForbiddenException e) {
        throw new ServerException(e.getServiceError());
    } catch (IOException e) {
        throw new ServerException(e.getMessage(), e);
    }
}
From source file:org.eclipse.che.api.vfs.TarArchiverTest.java
private byte[] createTestTarArchive() throws IOException { ByteArrayOutputStream byteOut = new ByteArrayOutputStream(); TarArchiveOutputStream tarOut = new TarArchiveOutputStream(byteOut); addDirectoryEntry(tarOut, new TarArchiveEntry("arc/")); addDirectoryEntry(tarOut, new TarArchiveEntry("arc/a/")); addFileEntry(tarOut, "arc/a/_a.txt"); addDirectoryEntry(tarOut, new TarArchiveEntry("arc/b/")); addFileEntry(tarOut, "arc/b/_b.txt"); addDirectoryEntry(tarOut, new TarArchiveEntry("arc/c/")); addFileEntry(tarOut, "arc/c/_c.txt"); tarOut.close();//from ww w . java2 s.c om return byteOut.toByteArray(); }
From source file:org.eclipse.che.api.vfs.TarArchiverTest.java
private void addFileEntry(TarArchiveOutputStream tarOut, String name) throws IOException { TarArchiveEntry entryA = new TarArchiveEntry(name); entryA.setSize(TEST_CONTENT_BYTES.length); tarOut.putArchiveEntry(entryA);//w w w.j a v a 2 s. c o m tarOut.write(TEST_CONTENT_BYTES); tarOut.closeArchiveEntry(); }