org.opensextant.xtext.collectors.ArchiveNavigator.java Source code

Java tutorial

Introduction

Here is the source code for org.opensextant.xtext.collectors.ArchiveNavigator.java

Source

/**
 *
 *      Copyright 2009-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * **************************************************************************
 *                          NOTICE
 * This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 */
package org.opensextant.xtext.collectors;

import java.io.*;
import java.util.zip.GZIPInputStream;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.zip.*;
import org.apache.commons.compress.archivers.tar.*;
import org.apache.commons.compress.utils.IOUtils;
import org.opensextant.ConfigException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.opensextant.xtext.ExclusionFilter;
import org.opensextant.xtext.Converter;

/**
 * Traverses an archive (ZIP, TAR, or gzip-compressed TAR). Each entry that is
 * not a directory and is not rejected by the {@link ExclusionFilter} is
 * extracted beneath the working directory and then handed to the
 * {@link Converter}.
 *
 * <p>The original note on this class states that no data is written to disk
 * unless XText is in save mode, and that a conversion listener should be
 * listening for converted docs. NOTE(review): the archive entries themselves
 * are always extracted to the working directory by this class; presumably the
 * note refers to the converted output -- confirm against the converter.
 *
 * @author Marc C. Ubaldino, MITRE, ubaldino at mitre dot org
 */
public class ArchiveNavigator implements Collector {

    private static final Logger log = LoggerFactory.getLogger(ArchiveNavigator.class);

    /** Directory beneath which archive entries are extracted. */
    private final File saveDir;
    /** Decides which entry names are skipped. */
    private final ExclusionFilter filter;
    /** Receives every extracted file. */
    private final Converter converter;
    /** Archive processed by {@link #collect()}. */
    private final File currentArchive;

    /** When false (the default), an entry whose target file already exists is not re-extracted. */
    public boolean overwrite = false;

    /**
     * Given a working temp folder and a file filter, unpack archives. The working dir, saveTo,
     * is not created by this constructor; it must exist ahead of time.
     *
     * @param inputFile  input archive
     * @param saveTo output dir where entries are saved.
     * @param fileFilter  file extension filter
     * @param fileConv  conversion resource, e.g. instance of XText
     * @throws IOException if the filter or the converter is null
     */
    public ArchiveNavigator(File inputFile, String saveTo, ExclusionFilter fileFilter, Converter fileConv)
            throws IOException {
        if (fileFilter == null || fileConv == null) {
            throw new IOException("Filter and converter cannot be null -- XText is the default for both.");
        }

        this.saveDir = new File(saveTo);
        this.filter = fileFilter;
        this.converter = fileConv;
        this.currentArchive = inputFile;
    }

    /**
     * @return absolute path of the directory beneath which entries are extracted
     */
    public String getWorkingDir() {
        return saveDir.getAbsolutePath();
    }

    /**
     * Unpack the archive given at construction time, dispatching on its file
     * extension. Each extracted file is passed to the converter.
     *
     * @throws ConfigException if archive output dirs were requested but unsettable or non-existent
     * @throws IOException if archive had I/O issues, is missing, or is an unsupported type of archive
     */
    @Override
    public void collect() throws IOException, ConfigException {
        if (currentArchive == null) {
            throw new IOException("No input archive was provided");
        }

        // getExtension() yields only the text after the last dot, so "x.tar.gz" reports "gz".
        String ext = FilenameUtils.getExtension(currentArchive.getPath());

        File archivetmp;
        if ("zip".equalsIgnoreCase(ext)) {
            archivetmp = unzip(currentArchive);
        } else if ("tar".equalsIgnoreCase(ext)) {
            archivetmp = untar(currentArchive);
        } else if ("gz".equalsIgnoreCase(ext) || "tgz".equalsIgnoreCase(ext)) {
            String basename = FilenameUtils.getBaseName(currentArchive.getName());
            // We assume the file is a tarball. First gunzip it to an intermediate TAR file ...
            File tarFile = gunzipAsTAR(currentArchive, basename);

            // ... then untar it. NOTE(review): the intermediate TAR file is left in the working dir.
            archivetmp = untar(tarFile);
        } else {
            throw new IOException("Unsupported archive type: EXT=" + ext);
        }
        log.info("Archive FILE={} has been processed to DIR={}", currentArchive, archivetmp);
    }

    /**
     * Un-ZIP. Same approach as {@link #untar(File)}, except that entries are
     * extracted directly into the working directory rather than into a
     * sub-folder named after the archive.
     *
     * @param zipFile ZIP archive to unpack
     * @return the directory entries were extracted into
     * @throws IOException if the archive cannot be opened or read as a ZIP
     * @throws ConfigException declared for conversion of extracted entries
     */
    public File unzip(File zipFile) throws IOException, ConfigException {
        File workingDir = saveDir;

        // try-with-resources closes both streams even when the archive stream cannot be created.
        try (InputStream input = new BufferedInputStream(new FileInputStream(zipFile));
                ZipArchiveInputStream in = (ZipArchiveInputStream) new ArchiveStreamFactory()
                        .createArchiveInputStream("zip", input)) {

            ZipArchiveEntry zipEntry;
            while ((zipEntry = (ZipArchiveEntry) in.getNextEntry()) != null) {
                if (filterEntry(zipEntry)) {
                    continue;
                }
                saveAndConvert(zipFile, zipEntry, in, workingDir.getAbsolutePath());
            }
        } catch (ArchiveException ae) {
            throw new IOException(ae);
        }
        return workingDir;
    }

    /**
     * Decompress a gzip file into the working directory, assuming its content is a TAR.
     *
     * @param theFile gzip-compressed input file
     * @param fname base name for the output; ".tar" is appended
     * @return TAR file path for result.
     * @throws IOException on I/O failure, including input that is not gzip data
     */
    private File gunzipAsTAR(File theFile, String fname) throws IOException {
        // TODO:  more testing on this particular case:  gunzip *.gz *.tgz *.tar.gz -- a mix of tar and gunzip
        File outFile = new File(getWorkingDir(), fname + ".tar");

        // The raw file stream is its own resource so it is closed even if the gzip header is invalid.
        try (InputStream fileIn = new FileInputStream(theFile);
                InputStream gzipIn = new GZIPInputStream(fileIn);
                OutputStream out = new BufferedOutputStream(new FileOutputStream(outFile))) {
            IOUtils.copy(gzipIn, out);
        }
        return outFile;
    }

    /**
     * Un-TAR. Once items are saved off to the temp folder, they'll be converted by
     * the file converter. The converter can choose to do something else with
     * them. Entries are extracted into a sub-folder of the working directory
     * named after the archive's base name.
     *
     * @param tarFile TAR archive to unpack
     * @return the directory entries were extracted into
     * @throws IOException if the archive cannot be opened or read as a TAR
     * @throws ConfigException declared for conversion of extracted entries
     */
    public File untar(File tarFile) throws IOException, ConfigException {

        String _working = FilenameUtils.concat(getWorkingDir(), FilenameUtils.getBaseName(tarFile.getPath()));
        if (_working == null) {
            throw new IOException("Invalid archive path for " + tarFile.getPath());
        }
        File workingDir = new File(_working);
        // Result intentionally ignored: saving an entry creates any missing parent folders.
        workingDir.mkdir();

        try (InputStream input = new BufferedInputStream(new FileInputStream(tarFile));
                TarArchiveInputStream in = (TarArchiveInputStream) new ArchiveStreamFactory()
                        .createArchiveInputStream("tar", input)) {

            TarArchiveEntry tarEntry;
            while ((tarEntry = (TarArchiveEntry) in.getNextEntry()) != null) {
                if (filterEntry(tarEntry)) {
                    continue;
                }
                saveAndConvert(tarFile, tarEntry, in, _working);
            }
        } catch (ArchiveException ae) {
            throw new IOException(ae);
        }
        return workingDir;
    }

    /**
     * Save one entry and hand the saved file to the converter. An I/O failure on a
     * single entry is logged and does not stop traversal of the archive.
     *
     * @param archive archive being traversed; used for the log message only
     * @param entry entry to save
     * @param archiveio archive stream positioned at the entry's data
     * @param root directory the entry is saved beneath
     * @throws ConfigException declared for conversion of the extracted entry
     */
    private void saveAndConvert(File archive, ArchiveEntry entry, InputStream archiveio, String root)
            throws ConfigException {
        try {
            File tmpFile = saveArchiveEntry(entry, archiveio, root);
            converter.convert(tmpFile);
        } catch (IOException err) {
            log.error("Unable to save item, FILE=" + archive.getName() + "!" + entry.getName(), err);
        }
    }

    /**
     * Save an entry beneath a root dir.
     *
     * @param E archive entry to save
     * @param archiveio archive stream positioned at the entry's data
     * @param root directory the entry is saved beneath
     * @return the saved file
     * @throws IOException if entry could not be saved to disk
     */
    private File saveArchiveEntry(ArchiveEntry E, InputStream archiveio, File root) throws IOException {
        return saveArchiveEntry(E, archiveio, root.getAbsolutePath());
    }

    /**
     * Save an entry beneath a root dir. If the target already exists and
     * {@link #overwrite} is false, the existing file is returned untouched.
     *
     * @param E archive entry to save
     * @param archiveio archive stream positioned at the entry's data; it is not closed here
     * @param root path of the directory the entry is saved beneath
     * @return the saved (or pre-existing) file
     * @throws IOException if the entry name is invalid, resolves outside of root,
     *         or the entry could not be written to disk
     */
    private File saveArchiveEntry(ArchiveEntry E, InputStream archiveio, String root) throws IOException {

        // Note: using native OS file path is fine here.  As long as you do not
        // try any string mechanics on paths.
        //
        String targetPath = FilenameUtils.concat(root, E.getName());
        if (targetPath == null) {
            throw new IOException("Invalid archive entry target for " + E.getName());
        }
        File target = new File(targetPath);

        // concat() resolves ".." segments and accepts absolute names, so an entry name
        // taken from the archive could otherwise point outside of the output folder.
        if (!isWithin(new File(root), target)) {
            throw new IOException("Archive entry resolves outside of output dir, ENTRY=" + E.getName());
        }

        if (target.exists() && !overwrite) {
            return target;
        }

        File parent = target.getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }
        log.debug("ARCHIVE_ENTRY={}", E.getName());

        // Only the output file is closed here; the archive stream belongs to the caller.
        try (OutputStream output = new FileOutputStream(target)) {
            IOUtils.copy(archiveio, output);
        }
        return target;
    }

    /**
     * Test whether a file lies strictly beneath a directory, comparing canonical paths.
     *
     * @param root containing directory
     * @param candidate file to test
     * @return true if candidate's canonical path is below root's canonical path
     * @throws IOException if a canonical path cannot be determined
     */
    private static boolean isWithin(File root, File candidate) throws IOException {
        String rootPath = root.getCanonicalPath();
        // The trailing separator stops "/out/dir-other" from matching root "/out/dir".
        if (!rootPath.endsWith(File.separator)) {
            rootPath = rootPath + File.separator;
        }
        return candidate.getCanonicalPath().startsWith(rootPath);
    }

    /**
     * @param E archive entry to test
     * @return true if the entry should be skipped: it is a directory or the filter rejects its name
     */
    private boolean filterEntry(ArchiveEntry E) {
        return E.isDirectory() || filter.filterOutFile(E.getName());
    }

    /**
     * Not implemented.
     *
     * @return always null. NOTE(review): confirm whether callers of Collector expect a non-null name.
     */
    @Override
    public String getName() {
        // TODO Auto-generated method stub
        return null;
    }
}