eu.delving.sip.files.FileImporter.java Source code

Java tutorial

Introduction

Here is the source code for eu.delving.sip.files.FileImporter.java

Source

/*
 * Copyright 2011, 2012 Delving BV
 *
 *  Licensed under the EUPL, Version 1.0 or - as soon they
 *  will be approved by the European Commission - subsequent
 *  versions of the EUPL (the "Licence");
 *  you may not use this work except in compliance with the
 *  Licence.
 *  You may obtain a copy of the Licence at:
 *
 *  http://ec.europa.eu/idabc/eupl
 *
 *  Unless required by applicable law or agreed to in
 *  writing, software distributed under the Licence is
 *  distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 *  express or implied.
 *  See the Licence for the specific language governing
 *  permissions and limitations under the Licence.
 */

package eu.delving.sip.files;

import eu.delving.metadata.Hasher;
import eu.delving.sip.base.CancelException;
import eu.delving.sip.base.ProgressListener;
import eu.delving.sip.base.Work;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CountingInputStream;

import java.io.*;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import static eu.delving.metadata.StringUtil.csvDelimiter;
import static eu.delving.metadata.StringUtil.csvEscapeXML;
import static eu.delving.metadata.StringUtil.csvLineParse;
import static eu.delving.metadata.StringUtil.csvTitleToTag;
import static eu.delving.sip.files.StorageHelper.BLOCK_SIZE;
import static eu.delving.sip.files.StorageHelper.delete;
import static eu.delving.sip.files.StorageHelper.statsFile;

/**
 * Handle the importing of files into a dataset.
 *
 * <p>The input file is copied, gzip-compressed, into the file returned by
 * {@code dataSet.importedOutput()}. How the content is converted depends on the
 * input file name:
 * <ul>
 *   <li>{@code *.csv} - each line becomes a {@code <csv-entry>} element;</li>
 *   <li>{@code *.xml.zip} - lines from the zip are wrapped in {@code <zip-entries>};</li>
 *   <li>{@code *.xml} and {@code *.xml.gz} - bytes are copied through after an XML header check.</li>
 * </ul>
 *
 * @author Gerald de Jong <gerald@delving.eu>
 */
public class FileImporter implements Work.DataSetWork, Work.LongTermWork {
    private static final String XML_HEADER = "<?xml";
    private final File inputFile;
    private ProgressListener progressListener;
    private CountingInputStream countingInputStream;
    private final Runnable finished;
    private final DataSet dataSet;
    // Updated while consuming XML and zipped XML; the value is not read anywhere in this class.
    private final Hasher hasher = new Hasher();

    /**
     * @param inputFile the file to import; its name decides how it is consumed
     * @param dataSet   the dataset that receives the imported data
     * @param finished  run after a successful import; may be null
     */
    public FileImporter(File inputFile, DataSet dataSet, Runnable finished) {
        this.inputFile = inputFile;
        this.dataSet = dataSet;
        this.finished = finished;
    }

    @Override
    public DataSet getDataSet() {
        return dataSet;
    }

    @Override
    public Job getJob() {
        return Job.IMPORT_SOURCE;
    }

    /**
     * Stores the listener used for progress reporting and sets its initial message.
     * Must be called before {@link #run()}, which uses the listener unconditionally.
     */
    @Override
    public void setProgressListener(ProgressListener progressListener) {
        this.progressListener = progressListener;
        progressListener.setProgressMessage("Storing data");
    }

    /**
     * Performs the import.
     *
     * <p>On success the stats file next to the imported output is deleted and the
     * {@code finished} callback (if any) is run. On cancellation or I/O failure the
     * imported output is deleted and the problem is reported through the progress
     * listener's feedback.
     *
     * @throws IllegalArgumentException if the input file name has an unsupported extension
     */
    @Override
    public void run() {
        int fileBlocks = (int) (inputFile.length() / BLOCK_SIZE);
        progressListener.prepareFor(fileBlocks);
        String name = inputFile.getName();
        boolean csv = name.endsWith(".csv");
        boolean xmlZip = name.endsWith(".xml.zip");
        boolean xmlGzip = name.endsWith(".xml.gz");
        boolean xml = xmlGzip || name.endsWith(".xml");
        // Reject unsupported input before touching the output file, so that nothing is
        // created or truncated for a file that cannot be imported anyway.
        if (!csv && !xmlZip && !xml) {
            throw new IllegalArgumentException("Unrecognized file extension: " + name);
        }
        try {
            OutputStream outputStream = null;
            InputStream inputStream = null;
            try {
                // Both streams are opened inside the try so that a failure opening the
                // second one still closes the first.
                outputStream = new GZIPOutputStream(new FileOutputStream(dataSet.importedOutput()));
                // The counter wraps the raw file stream, so progress is measured in bytes
                // of the input file even when that file is gzipped.
                inputStream = countingInputStream = new CountingInputStream(new FileInputStream(inputFile));
                if (csv) {
                    consumeCSVFile(inputStream, outputStream);
                } else if (xmlZip) {
                    consumeXMLZipFile(inputStream, outputStream);
                } else {
                    if (xmlGzip) {
                        inputStream = new GZIPInputStream(inputStream);
                    }
                    consumeXMLFile(inputStream, outputStream);
                }
                // Close explicitly on the success path: this writes the gzip trailer, and
                // a failure here must surface as an IOException rather than be swallowed
                // by closeQuietly below. Closing an already closed stream is harmless.
                outputStream.close();
            } finally {
                IOUtils.closeQuietly(outputStream);
                IOUtils.closeQuietly(inputStream);
            }
            delete(statsFile(dataSet.importedOutput().getParentFile(), false, null));
            if (finished != null) {
                finished.run();
            }
        } catch (CancelException e) {
            delete(dataSet.importedOutput());
            progressListener.getFeedback().alert("Cancelled", e);
        } catch (IOException e) {
            delete(dataSet.importedOutput());
            progressListener.getFeedback().alert("Unable to import: " + e.getMessage(), e);
        }
    }

    /**
     * Converts CSV text (read as UTF-8) into an XML document with one {@code <csv-entry>}
     * per data line. The first line supplies the delimiter and the column titles, which
     * become the element names. Closes the output when done.
     *
     * @throws IOException     on read/write failure, or when a non-blank line does not have
     *                         as many fields as the title line
     * @throws CancelException if progress reporting signals cancellation
     */
    private void consumeCSVFile(InputStream inputStream, OutputStream outputStream)
            throws IOException, CancelException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        Writer writer = new OutputStreamWriter(outputStream, "UTF-8");
        writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        writer.write("<csv-entries>\n");
        char delimiter = ',';
        String line;
        List<String> titles = null;
        // Zero-based: the title line is line 0, the first data line is line 1.
        int lineNumber = 0;
        while ((line = reader.readLine()) != null) {
            if (lineNumber == 0) {
                delimiter = csvDelimiter(line);
                titles = csvLineParse(line, delimiter);
                for (int walk = 0; walk < titles.size(); walk++) {
                    titles.set(walk, csvTitleToTag(titles.get(walk), walk));
                }
            } else {
                List<String> values = csvLineParse(line, delimiter);
                if (values.size() != titles.size()) {
                    // A line that parses to a single empty value is skipped silently.
                    // Note that skipping this way does not advance lineNumber.
                    if (values.size() == 1 && values.get(0).isEmpty()) {
                        continue;
                    }
                    throw new IOException(
                            String.format("Expected %d fields in CSV file on line %d", titles.size(), lineNumber));
                }
                writer.write(String.format("<csv-entry line=\"%d\">\n", lineNumber));
                for (int walk = 0; walk < titles.size(); walk++) {
                    writer.write(String.format("   <%s>%s</%s>\n", titles.get(walk), csvEscapeXML(values.get(walk)),
                            titles.get(walk)));
                }
                writer.write("</csv-entry>\n");
            }
            lineNumber++;
            showProgress();
        }
        writer.write("</csv-entries>\n");
        writer.close();
    }

    /**
     * Copies the lines delivered by a {@link ZipEntryXmlReader} into a single XML document
     * wrapped in {@code <zip-entries>}, dropping every line that starts with an XML
     * declaration. Closes the output when done.
     *
     * @throws IOException     on read/write failure
     * @throws CancelException if progress reporting signals cancellation
     */
    private void consumeXMLZipFile(InputStream inputStream, OutputStream outputStream)
            throws IOException, CancelException {
        ZipEntryXmlReader reader = new ZipEntryXmlReader(inputStream);
        Writer writer = new OutputStreamWriter(outputStream, "UTF-8");
        writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        writer.write("<zip-entries>\n");
        while (true) {
            String line = reader.readLine();
            if (line == null) {
                break;
            }
            // Only one XML declaration is allowed, and it has been written above.
            if (line.startsWith("<?xml")) {
                continue;
            }
            writer.write(line);
            writer.write("\n");
            showProgress();
            hasher.update(line);
        }
        writer.write("</zip-entries>\n");
        writer.close();
    }

    /**
     * Copies the input bytes unchanged to the output after checking that the first chunk
     * read starts, ignoring anything before its first {@code '<'}, with an XML declaration.
     * Does not close the output; the caller does.
     *
     * @throws IOException     on read/write failure, or when the XML header is missing
     * @throws CancelException if progress reporting signals cancellation
     */
    private void consumeXMLFile(InputStream inputStream, OutputStream outputStream)
            throws IOException, CancelException {
        boolean headerFound = false;
        byte[] buffer = new byte[BLOCK_SIZE];
        int bytesRead;
        while (-1 != (bytesRead = inputStream.read(buffer))) {
            if (!headerFound) {
                // Decode only the bytes actually read, not the whole buffer.
                String chunk = new String(buffer, 0, bytesRead, "UTF-8");
                // Skip whatever precedes the first '<' (such as a byte-order mark).
                int firstAngle = chunk.indexOf('<');
                if (firstAngle > 0) {
                    chunk = chunk.substring(firstAngle);
                }
                if (!chunk.startsWith(XML_HEADER)) {
                    throw new IOException(String.format("Not an XML File. Must begin with '%s...'.", XML_HEADER));
                }
                headerFound = true;
            }
            outputStream.write(buffer, 0, bytesRead);
            showProgress();
            hasher.update(buffer, bytesRead);
        }
    }

    /**
     * Reports how many blocks of the input file have been read so far. The unit is
     * {@code BLOCK_SIZE}, matching the total passed to {@code prepareFor} in {@link #run()}.
     *
     * @throws CancelException if the listener signals cancellation
     */
    private void showProgress() throws CancelException {
        progressListener.setProgress((int) (countingInputStream.getByteCount() / BLOCK_SIZE));
    }
}