Example usage for the org.apache.commons.compress.archivers.tar.TarArchiveEntry constructor

List of usage examples for the org.apache.commons.compress.archivers.tar.TarArchiveEntry constructor

Introduction

On this page you can find example usages of the org.apache.commons.compress.archivers.tar.TarArchiveEntry constructor.

Prototype

public TarArchiveEntry(byte[] headerBuf) 

Source Link

Document

Construct an entry from an archive's header bytes.

Usage

From source file:org.apache.nutch.tools.CommonCrawlDataDumper.java

/**
 * Dumps the reverse engineered CBOR content from the provided segment
 * directories if a parent directory contains more than one segment,
 * otherwise a single segment can be passed as an argument. If the boolean
 * argument is provided then the CBOR is also zipped.
 *
 * @param outputDir      the directory you wish to dump the raw content to. This
 *                       directory will be created.
 * @param segmentRootDir a directory containing one or more segments.
 * @param linkdb         Path to linkdb.
 * @param gzip           a boolean flag indicating whether the CBOR content should also
 *                       be gzipped./*from w ww.  j av a 2  s. com*/
 * @param epochFilename  if {@code true}, output files will be names using the epoch time (in milliseconds).
 * @param extension      a file extension to use with output documents.
 * @throws Exception if any exception occurs.
 */
public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes,
        boolean epochFilename, String extension, boolean warc) throws Exception {
    if (gzip) {
        LOG.info("Gzipping CBOR data has been skipped");
    }
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<>();
    // filtered file counters
    Map<String, Integer> filteredCounts = new HashMap<>();

    Configuration nutchConfig = NutchConfiguration.create();
    Path segmentRootPath = new Path(segmentRootDir.toString());
    FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);

    //get all paths
    List<Path> parts = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
    String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}"
            + File.separator + "data";
    while (files.hasNext()) {
        LocatedFileStatus next = files.next();
        if (next.isFile()) {
            Path path = next.getPath();
            if (path.toString().matches(partPattern)) {
                parts.add(path);
            }
        }
    }

    LinkDbReader linkDbReader = null;
    if (linkdb != null) {
        linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
    }
    if (parts == null || parts.size() == 0) {
        LOG.error("No segment directories found in {} ", segmentRootDir.getAbsolutePath());
        System.exit(1);
    }
    LOG.info("Found {} segment parts", parts.size());
    if (gzip && !warc) {
        fileList = new ArrayList<>();
        constructNewStream(outputDir);
    }

    for (Path segmentPart : parts) {
        LOG.info("Processing segment Part : [ {} ]", segmentPart);
        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig,
                    SequenceFile.Reader.file(segmentPart));

            Writable key = (Writable) reader.getKeyClass().newInstance();

            Content content = null;
            while (reader.next(key)) {
                content = new Content();
                reader.getCurrentValue(content);
                Metadata metadata = content.getMetadata();
                String url = key.toString();

                String baseName = FilenameUtils.getBaseName(url);
                String extensionName = FilenameUtils.getExtension(url);

                if (!extension.isEmpty()) {
                    extensionName = extension;
                } else if ((extensionName == null) || extensionName.isEmpty()) {
                    extensionName = "html";
                }

                String outputFullPath = null;
                String outputRelativePath = null;
                String filename = null;
                String timestamp = null;
                String reverseKey = null;

                if (epochFilename || config.getReverseKey()) {
                    try {
                        long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z")
                                .parse(getDate(metadata.get("Date"))).getTime();
                        timestamp = String.valueOf(epoch);
                    } catch (ParseException pe) {
                        LOG.warn(pe.getMessage());
                    }

                    reverseKey = reverseUrl(url);
                    config.setReverseKeyValue(
                            reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url) + "_" + timestamp);
                }

                if (!warc) {
                    if (epochFilename) {
                        outputFullPath = DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(),
                                reverseKey, url, timestamp, extensionName, !gzip);
                        outputRelativePath = outputFullPath.substring(0,
                                outputFullPath.lastIndexOf(File.separator) - 1);
                        filename = content.getMetadata().get(Metadata.DATE) + "." + extensionName;
                    } else {
                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(),
                                md5Ofurl, !gzip);
                        filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
                        outputFullPath = String.format("%s/%s", fullDir, filename);

                        String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
                        String firstLevelDirName = fullPathLevels[fullPathLevels.length - 2];
                        String secondLevelDirName = fullPathLevels[fullPathLevels.length - 1];
                        outputRelativePath = firstLevelDirName + secondLevelDirName;
                    }
                }
                // Encode all filetypes if no mimetypes have been given
                Boolean filter = (mimeTypes == null);

                String jsonData = "";
                try {
                    String mimeType = new Tika().detect(content.getContent());
                    // Maps file to JSON-based structure

                    Set<String> inUrls = null; //there may be duplicates, so using set
                    if (linkDbReader != null) {
                        Inlinks inlinks = linkDbReader.getInlinks((Text) key);
                        if (inlinks != null) {
                            Iterator<Inlink> iterator = inlinks.iterator();
                            inUrls = new LinkedHashSet<>();
                            while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()) {
                                inUrls.add(iterator.next().getFromUrl());
                            }
                        }
                    }
                    //TODO: Make this Jackson Format implementation reusable
                    try (CommonCrawlFormat format = CommonCrawlFormatFactory
                            .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
                        if (inUrls != null) {
                            format.setInLinks(new ArrayList<>(inUrls));
                        }
                        jsonData = format.getJsonData(url, content, metadata);
                    }

                    collectStats(typeCounts, mimeType);
                    // collects statistics for the given mimetypes
                    if ((mimeType != null) && (mimeTypes != null)
                            && Arrays.asList(mimeTypes).contains(mimeType)) {
                        collectStats(filteredCounts, mimeType);
                        filter = true;
                    }
                } catch (IOException ioe) {
                    LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
                    return;
                }

                if (!warc) {
                    if (filter) {
                        byte[] byteData = serializeCBORData(jsonData);

                        if (!gzip) {
                            File outputFile = new File(outputFullPath);
                            if (outputFile.exists()) {
                                LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                            } else {
                                LOG.info("Writing: [" + outputFullPath + "]");
                                IOUtils.copy(new ByteArrayInputStream(byteData),
                                        new FileOutputStream(outputFile));
                            }
                        } else {
                            if (fileList.contains(outputFullPath)) {
                                LOG.info("Skipping compressing: [" + outputFullPath + "]: file already exists");
                            } else {
                                fileList.add(outputFullPath);
                                LOG.info("Compressing: [" + outputFullPath + "]");
                                //TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
                                TarArchiveEntry tarEntry = new TarArchiveEntry(
                                        outputRelativePath + File.separator + filename);
                                tarEntry.setSize(byteData.length);
                                tarOutput.putArchiveEntry(tarEntry);
                                tarOutput.write(byteData);
                                tarOutput.closeArchiveEntry();
                            }
                        }
                    }
                }
            }
            reader.close();
        } catch (Exception e) {
            LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
        } finally {
            fs.close();
        }
    }

    if (gzip && !warc) {
        closeStream();
    }

    if (!typeCounts.isEmpty()) {
        LOG.info("CommonsCrawlDataDumper File Stats: "
                + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    }

}

From source file:org.apache.openaz.xacml.admin.components.PolicyWorkspace.java

/**
 * Packs the user's repository into {@code Repository.tgz} inside the user's
 * workspace and returns a stream over that file.
 * <p>
 * Directories whose name starts with {@code .git} are skipped together with
 * their contents, and only files whose name ends in {@code .xml} are added.
 * Failures while building the archive are logged, not propagated.
 *
 * @return a stream over the archive file, or {@code null} if the file could
 *         not be opened for reading
 */
@Override
public InputStream getStream() {
    //
    // Grab our working repository
    //
    final Path repoPath = ((XacmlAdminUI) getUI()).getUserGitPath();
    Path workspacePath = ((XacmlAdminUI) getUI()).getUserWorkspace();
    final Path tarFile = Paths.get(workspacePath.toString(), "Repository.tgz");

    try (OutputStream os = Files.newOutputStream(tarFile);
            GzipCompressorOutputStream gzOut = new GzipCompressorOutputStream(os);
            TarArchiveOutputStream tarOut = new TarArchiveOutputStream(gzOut)) {

        tarOut.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);

        Files.walkFileTree(repoPath, new SimpleFileVisitor<Path>() {

            @Override
            public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs)
                    throws IOException {
                if (dir.getFileName().toString().startsWith(".git")) {
                    return FileVisitResult.SKIP_SUBTREE;
                }
                Path relative = repoPath.relativize(dir);
                // the repository root itself gets no entry
                if (relative.toString().isEmpty()) {
                    return super.preVisitDirectory(dir, attrs);
                }
                // Attributes come from the real directory, the entry name from the
                // repository-relative path. Building the entry from the relative
                // path alone would look it up against the JVM working directory.
                TarArchiveEntry entry = new TarArchiveEntry(dir.toFile(), relative.toString());
                tarOut.putArchiveEntry(entry);
                tarOut.closeArchiveEntry();
                return super.preVisitDirectory(dir, attrs);
            }

            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                if (!file.getFileName().toString().endsWith(".xml")) {
                    return super.visitFile(file, attrs);
                }
                Path relative = repoPath.relativize(file);
                TarArchiveEntry entry = new TarArchiveEntry(file.toFile(), relative.toString());
                entry.setSize(Files.size(file));
                tarOut.putArchiveEntry(entry);
                try {
                    // Files.copy opens and closes the source stream itself
                    Files.copy(file, tarOut);
                } catch (IOException e) {
                    // NOTE(review): after a partial copy, closeArchiveEntry() below may
                    // itself fail because fewer bytes than declared were written — confirm
                    logger.error(e);
                }
                tarOut.closeArchiveEntry();
                return super.visitFile(file, attrs);
            }

        });
        tarOut.finish();
    } catch (IOException e) {
        logger.error(e);
    }
    try {
        return Files.newInputStream(tarFile);
    } catch (IOException e) {
        logger.error(e);
    }
    return null;
}

From source file:org.apache.tika.server.TarWriter.java

/**
 * Writes the given bytes to the tar stream as a single entry.
 *
 * @param zip        the tar stream to append to
 * @param name       the name of the new entry
 * @param dataBuffer the complete content of the entry
 * @throws IOException if the entry cannot be written
 */
private static void tarStoreBuffer(TarArchiveOutputStream zip, String name, byte[] dataBuffer)
        throws IOException {
    final TarArchiveEntry tarEntry = new TarArchiveEntry(name);
    // the size has to be set on the entry before it is put into the archive
    tarEntry.setSize(dataBuffer.length);
    zip.putArchiveEntry(tarEntry);
    zip.write(dataBuffer);
    zip.closeArchiveEntry();
}

From source file:org.artifactory.util.ArchiveUtils.java

/**
 * Creates an archive entry of the requested flavour for content that will be
 * streamed into the archive; the size has to be known up front.
 *
 * @param relativePath the entry name inside the archive
 * @param archiveType  the kind of archive the entry is meant for
 * @param size         the size of the entry content
 * @return a zip entry for {@code ZIP}, a tar entry for {@code TAR}, {@code TARGZ} and {@code TGZ}
 * @throws IllegalArgumentException for any other archive type
 */
public static ArchiveEntry createArchiveEntry(String relativePath, ArchiveType archiveType, long size) {
    switch (archiveType) {
    case TAR:
    case TARGZ:
    case TGZ: {
        TarArchiveEntry entry = new TarArchiveEntry(relativePath);
        entry.setSize(size);
        return entry;
    }
    case ZIP: {
        ZipArchiveEntry entry = new ZipArchiveEntry(relativePath);
        entry.setSize(size);
        return entry;
    }
    default:
        throw new IllegalArgumentException("Unsupported archive type: '" + archiveType + "'");
    }
}

From source file:org.codehaus.plexus.archiver.tar.TarRoundTripTest.java

/**
 * test round-tripping long (GNU) entries
 * <p>
 * Writes one entry named {@code LONG_NAME} in GNU long-file mode to an
 * in-memory archive, reads it back and checks that the name is unchanged and
 * that no further entry follows.
 */
public void testLongRoundTripping() throws IOException {
    TarArchiveEntry original = new TarArchiveEntry(LONG_NAME);
    assertEquals("over 100 chars", true, LONG_NAME.length() > 100);
    assertEquals("original name", LONG_NAME, original.getName());

    ByteArrayOutputStream buff = new ByteArrayOutputStream();
    // try-with-resources: the stream is closed even if writing the entry fails
    try (TarArchiveOutputStream tos = new TarArchiveOutputStream(buff)) {
        tos.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
        tos.putArchiveEntry(original);
        tos.closeArchiveEntry();
    }

    // the archive is complete once the output stream has been closed
    try (TarArchiveInputStream tis = new TarArchiveInputStream(
            new ByteArrayInputStream(buff.toByteArray()))) {
        TarArchiveEntry tripped = tis.getNextTarEntry();
        assertEquals("round-tripped name", LONG_NAME, tripped.getName());
        assertNull("no more entries", tis.getNextEntry());
    }
}

From source file:org.dcm4chee.storage.tar.TarContainerProvider.java

/**
 * Writes the given entries as a tar archive to {@code out}.
 * <p>
 * If the container configuration names a checksum entry, that entry is
 * written first, followed by one tar entry per container entry copied from
 * its source path. The archive is finished but {@code out} is not closed here.
 *
 * @param context the storage context (not used by this method)
 * @param entries the entries to archive
 * @param out     the stream receiving the tar archive
 * @throws IOException if reading a source file or writing the archive fails
 */
@Override
public void writeEntriesTo(StorageContext context, List<ContainerEntry> entries, OutputStream out)
        throws IOException {
    final TarArchiveOutputStream tarOut = new TarArchiveOutputStream(out);

    final String checksumName = container.getChecksumEntry();
    if (checksumName != null) {
        // checksums are buffered first so the entry size is known up front
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        ContainerEntry.writeChecksumsTo(entries, buffer);
        final byte[] checksums = buffer.toByteArray();

        final TarArchiveEntry checksumHeader = new TarArchiveEntry(checksumName);
        checksumHeader.setSize(checksums.length);
        tarOut.putArchiveEntry(checksumHeader);
        tarOut.write(checksums);
        tarOut.closeArchiveEntry();
    }

    for (ContainerEntry containerEntry : entries) {
        final Path source = containerEntry.getSourcePath();
        final TarArchiveEntry header = new TarArchiveEntry(containerEntry.getName());
        header.setModTime(Files.getLastModifiedTime(source).toMillis());
        header.setSize(Files.size(source));
        tarOut.putArchiveEntry(header);
        Files.copy(source, tarOut);
        tarOut.closeArchiveEntry();
    }

    tarOut.finish();
}

From source file:org.dspace.pack.bagit.Bag.java

/**
 * Recursively adds every regular file below {@code dirFile} to the archive.
 * Directories get no entry of their own; they only contribute to the names
 * of the files they contain.
 *
 * @param dirFile the directory to walk
 * @param relBase the entry-name prefix for the children of {@code dirFile}
 * @param out     the archive stream to append to
 * @throws IOException if the directory cannot be listed or a file cannot be
 *                     read or written to the archive
 */
private void fillArchive(File dirFile, String relBase, ArchiveOutputStream out) throws IOException {
    File[] children = dirFile.listFiles();
    if (children == null) {
        // listFiles() returns null when the path is not a directory or cannot be read
        throw new IOException("Unable to list directory: " + dirFile);
    }
    for (File file : children) {
        String relPath = relBase + File.separator + file.getName();
        if (file.isDirectory()) {
            fillArchive(file, relPath, out);
        } else {
            TarArchiveEntry entry = new TarArchiveEntry(relPath);
            entry.setSize(file.length());
            // modification time is pinned to the epoch for every entry
            entry.setModTime(0L);
            out.putArchiveEntry(entry);
            // try-with-resources: the file handle is released even if the copy fails
            try (FileInputStream fin = new FileInputStream(file)) {
                Utils.copy(fin, out);
            }
            out.closeArchiveEntry();
        }
    }
}

From source file:org.eclipse.che.api.vfs.TarArchiver.java

/**
 * Appends one virtual file or folder to the tar stream.
 * <p>
 * Folders are written with a modification time of 0 and no content; files
 * carry their length, last modification date and content.
 *
 * @param virtualFile     the item to archive
 * @param tarOutputStream the tar stream to append to
 * @throws ServerException if access to the item is forbidden or an I/O error occurs
 */
private void addTarEntry(VirtualFile virtualFile, TarArchiveOutputStream tarOutputStream)
        throws ServerException {
    try {
        final TarArchiveEntry entry = new TarArchiveEntry(getTarEntryName(virtualFile));
        final boolean folder = virtualFile.isFolder();

        // fill in the header first; it must be complete before the entry is put
        if (folder) {
            entry.setModTime(0);
        } else {
            entry.setSize(virtualFile.getLength());
            entry.setModTime(virtualFile.getLastModificationDate());
        }
        tarOutputStream.putArchiveEntry(entry);

        // only files have a body to stream
        if (!folder) {
            try (InputStream content = virtualFile.getContent()) {
                ByteStreams.copy(content, tarOutputStream);
            }
        }
        tarOutputStream.closeArchiveEntry();
    } catch (ForbiddenException e) {
        throw new ServerException(e.getServiceError());
    } catch (IOException e) {
        throw new ServerException(e.getMessage(), e);
    }
}

From source file:org.eclipse.che.api.vfs.TarArchiverTest.java

/**
 * Builds an in-memory tar archive used as a test fixture: a root folder
 * {@code arc/} with sub-folders {@code a}, {@code b} and {@code c}, each
 * holding one text file named after its folder.
 *
 * @return the bytes of the finished archive
 * @throws IOException if writing the archive fails
 */
private byte[] createTestTarArchive() throws IOException {
    final ByteArrayOutputStream archiveBytes = new ByteArrayOutputStream();
    final TarArchiveOutputStream archive = new TarArchiveOutputStream(archiveBytes);

    addDirectoryEntry(archive, new TarArchiveEntry("arc/"));
    // each sub-folder is followed directly by its single file
    for (String folder : new String[] { "a", "b", "c" }) {
        addDirectoryEntry(archive, new TarArchiveEntry("arc/" + folder + "/"));
        addFileEntry(archive, "arc/" + folder + "/_" + folder + ".txt");
    }

    archive.close();
    return archiveBytes.toByteArray();
}

From source file:org.eclipse.che.api.vfs.TarArchiverTest.java

/**
 * Adds a file entry with the shared test content to the tar stream.
 *
 * @param tarOut the tar stream to append to
 * @param name   the name of the new entry
 * @throws IOException if writing the entry fails
 */
private void addFileEntry(TarArchiveOutputStream tarOut, String name) throws IOException {
    final TarArchiveEntry fileEntry = new TarArchiveEntry(name);
    // the declared size must match the number of bytes written below
    fileEntry.setSize(TEST_CONTENT_BYTES.length);

    tarOut.putArchiveEntry(fileEntry);
    tarOut.write(TEST_CONTENT_BYTES);
    tarOut.closeArchiveEntry();
}