edu.jhu.hlt.concrete.ingesters.simple.CompleteFileIngester.java Source code

Java tutorial

Introduction

Here is the source code for edu.jhu.hlt.concrete.ingesters.simple.CompleteFileIngester.java

Source

/*
 * Copyright 2012-2015 Johns Hopkins University HLTCOE. All rights reserved.
 * See LICENSE in the project root directory.
 */

package edu.jhu.hlt.concrete.ingesters.simple;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.jhu.hlt.concrete.Communication;
import edu.jhu.hlt.concrete.Section;
import edu.jhu.hlt.concrete.TextSpan;
import edu.jhu.hlt.concrete.communications.WritableCommunication;
import edu.jhu.hlt.concrete.ingesters.base.IngestException;
import edu.jhu.hlt.concrete.ingesters.base.UTF8FileIngester;
import edu.jhu.hlt.concrete.metadata.tools.TooledMetadataConverter;
import edu.jhu.hlt.concrete.section.SingleSectionSegmenter;
import edu.jhu.hlt.concrete.util.ConcreteException;
import edu.jhu.hlt.concrete.util.ProjectConstants;
import edu.jhu.hlt.concrete.util.Timing;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory;
import edu.jhu.hlt.concrete.uuid.AnalyticUUIDGeneratorFactory.AnalyticUUIDGenerator;
import edu.jhu.hlt.utilt.io.ExistingNonDirectoryFile;
import edu.jhu.hlt.utilt.io.NotFileException;

/**
 * Implementation of {@link UTF8FileIngester} whose {@link UTF8FileIngester#fromCharacterBasedFile(Path)}
 * implementation converts the entire contents of a
 * character-based file to a {@link Communication} object.
 * <ul>
 *  <li>
 *   The file name is used as the ID of the Communication.
 *  </li>
 *  <li>
 *   The Communication will contain one {@link Section} with one {@link TextSpan}.
 *  </li>
 * </ul>
 */
public class CompleteFileIngester implements UTF8FileIngester {

    private static final Logger logger = LoggerFactory.getLogger(CompleteFileIngester.class);

    /** Value assigned as the type of every {@link Communication} this ingester produces. */
    private final String kind;

    /** Value of {@link Timing#currentLocalTime()} captured when this ingester was constructed. */
    private final long ts;

    /**
     * @param kind the kind of produced {@link Communication} objects
     */
    public CompleteFileIngester(String kind) {
        this.kind = kind;
        this.ts = Timing.currentLocalTime();
    }

    /**
     * Reads the whole file at {@code path} as UTF-8 text and wraps it in a new
     * {@link Communication}: the ID is the file name, the type is {@link #getKind()},
     * the text is the file content, and a single section of kind {@code "Other"} is attached.
     *
     * @param path the character-based file to ingest
     * @return the newly built {@link Communication}
     * @throws IngestException if {@code path} does not exist or is not a file, or if an
     *         {@link IOException} or {@link ConcreteException} is raised while reading the
     *         file and building the {@link Communication}
     * @see UTF8FileIngester#fromCharacterBasedFile(Path)
     */
    @Override
    public Communication fromCharacterBasedFile(Path path) throws IngestException {
        try {
            ExistingNonDirectoryFile f = new ExistingNonDirectoryFile(path);
            // The read is kept in its own try so that any IOException raised while reading
            // (including a NoSuchFileException) is reported as a read failure, not as a bad path.
            try (InputStream is = Files.newInputStream(f.getPath())) {
                String content = IOUtils.toString(is, StandardCharsets.UTF_8);
                AnalyticUUIDGenerator g = new AnalyticUUIDGeneratorFactory().create();

                Communication c = new Communication();
                c.setId(f.getName());
                c.setUuid(g.next());
                c.setText(content);
                c.setType(this.getKind());
                c.setMetadata(TooledMetadataConverter.convert(this));

                Section s = SingleSectionSegmenter.createSingleSection(c, "Other");
                c.addToSectionList(s);
                return c;
            } catch (IOException | ConcreteException e) {
                throw new IngestException("Caught exception reading in document.", e);
            }
        } catch (NoSuchFileException | NotFileException e) {
            throw new IngestException("Path did not exist or was a directory.", e);
        }
    }

    /**
     * Command-line entry point: converts one UTF-8 character-based file into a
     * {@code .concrete} file placed in the given output folder. The process exits with
     * status 1 on bad usage, on an unusable input or output path, when the output file
     * already exists, and when ingesting or writing fails.
     *
     * @param args exactly three values: input file path, {@link Communication} type,
     *        output folder path
     */
    public static void main(String[] args) {
        if (args.length != 3) {
            printUsage();
            System.exit(1);
        }

        String inPathStr = args[0];
        String commType = args[1];
        String outPathStr = args[2];

        Path inPath = Paths.get(inPathStr);
        try {
            ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(inPath);

            Path ep = ef.getPath();
            String fn = ef.getName();
            Path outPath = Paths.get(outPathStr);
            Path outFile = outPath.resolve(fn + ".concrete");

            if (!Files.exists(outPath)) {
                // Output directory is missing: try to create it, and give up if that fails
                // rather than attempting to write into a directory that does not exist.
                try {
                    Files.createDirectories(outPath);
                } catch (IOException e) {
                    logger.error("Caught exception when making output directories.", e);
                    System.exit(1);
                }
            } else if (!Files.isDirectory(outPath)) {
                logger.error("Output path exists but is not a directory.");
                System.exit(1);
            } else if (Files.exists(outFile)) {
                // Never clobber an existing output file.
                logger.warn("Output file {} exists; not overwriting.", outFile.toString());
                System.exit(1);
            }

            try {
                UTF8FileIngester ing = new CompleteFileIngester(commType);
                Communication comm = ing.fromCharacterBasedFile(ep);
                new WritableCommunication(comm).writeToFile(outFile, false);
            } catch (IngestException e) {
                logger.error("Caught exception during ingest.", e);
                System.exit(1);
            } catch (ConcreteException e) {
                logger.error("Caught exception writing output.", e);
                // A failed write is a failed run; report it through the exit status too.
                System.exit(1);
            }

        } catch (NoSuchFileException e) {
            logger.error("Path {} does not exist.", inPathStr);
            System.exit(1);
        } catch (NotFileException e) {
            logger.error("Path {} is a directory.", inPathStr);
            System.exit(1);
        }
    }

    /**
     * Prints the command-line usage text to standard error.
     */
    private static void printUsage() {
        System.err.println("This program converts a character-based file to a .concrete file.");
        System.err.println("The text file must contain UTF-8 encoded characters.");
        System.err.println(
                "The .concrete file will share the same name as the input file, including the extension.");
        System.err.println("This program takes 3 arguments.");
        System.err.println("Argument 1: path/to/a/character/based/file");
        System.err.println("Argument 2: type of Communication to generate [e.g., tweet]");
        System.err.println("Argument 3: path/to/output/folder");
        System.err.println("Example usage: " + CompleteFileIngester.class.getName()
                + " /my/text/file story /my/output/folder");
    }

    /*
     * (non-Javadoc)
     * Returns the kind supplied to the constructor.
     * @see edu.jhu.hlt.concrete.ingesters.base.Ingester#getKind()
     */
    @Override
    public String getKind() {
        return this.kind;
    }

    /*
     * (non-Javadoc)
     * Returns the timestamp captured when this ingester was constructed.
     * @see edu.jhu.hlt.concrete.safe.metadata.SafeAnnotationMetadata#getTimestamp()
     */
    @Override
    public long getTimestamp() {
        return this.ts;
    }

    /*
     * (non-Javadoc)
     * Returns the runtime class name followed by a fixed project tag.
     * @see edu.jhu.hlt.concrete.metadata.tools.MetadataTool#getToolName()
     */
    @Override
    public String getToolName() {
        return this.getClass().getName() + " [Project: concrete-ingesters-simple]";
    }

    /*
     * (non-Javadoc)
     * Returns ProjectConstants.VERSION.
     * @see edu.jhu.hlt.concrete.metadata.tools.MetadataTool#getToolVersion()
     */
    @Override
    public String getToolVersion() {
        return ProjectConstants.VERSION;
    }

    /*
     * (non-Javadoc)
     * Returns a new mutable list holding a single note that records the Communication kind.
     * @see edu.jhu.hlt.concrete.metadata.tools.MetadataTool#getToolNotes()
     */
    @Override
    public List<String> getToolNotes() {
        List<String> sl = new ArrayList<>();
        sl.add("Communication kind: " + this.kind);
        return sl;
    }
}