Example usage for org.apache.mahout.common IOUtils close

Introduction

In this page you can find the example usage for org.apache.mahout.common IOUtils close.

Prototype

public static void close(Collection<? extends Closeable> closeables) throws IOException

Source Link

Document

make sure to close all sources, log all of the problems occurred, clear closeables (to prevent repeating close attempts), re-throw the last one at the end.

Usage

From source file:com.tamingtext.tagging.LuceneCategoryExtractor.java

License:Apache License

public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("dir").withRequired(true)
            .withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
            .withDescription("The Lucene directory").withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(false)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();

    Option maxOpt = obuilder.withLongName("max").withRequired(false)
            .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create())
            .withDescription(//w  ww .  j a  v a 2  s .  co  m
                    "The maximum number of documents to analyze.  If not specified, then it will loop over all docs")
            .withShortName("m").create();

    Option fieldOpt = obuilder.withLongName("field").withRequired(true)
            .withArgument(abuilder.withName("field").withMinimum(1).withMaximum(1).create())
            .withDescription("The field in the index").withShortName("f").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
            .withOption(fieldOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        File inputDir = new File(cmdLine.getValue(inputOpt).toString());

        if (!inputDir.isDirectory()) {
            throw new IllegalArgumentException(inputDir + " does not exist or is not a directory");
        }

        long maxDocs = Long.MAX_VALUE;
        if (cmdLine.hasOption(maxOpt)) {
            maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
        }

        if (maxDocs < 0) {
            throw new IllegalArgumentException("maxDocs must be >= 0");
        }

        String field = cmdLine.getValue(fieldOpt).toString();

        PrintWriter out = null;
        if (cmdLine.hasOption(outputOpt)) {
            out = new PrintWriter(new FileWriter(cmdLine.getValue(outputOpt).toString()));
        } else {
            out = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"));
        }

        dumpDocumentFields(inputDir, field, maxDocs, out);

        IOUtils.close(Collections.singleton(out));
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }

}

From source file:com.tamingtext.tagging.LuceneTagExtractor.java

License:Apache License

public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("dir").withRequired(true)
            .withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
            .withDescription("The Lucene directory").withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(false)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();

    Option maxOpt = obuilder.withLongName("max").withRequired(false)
            .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create())
            .withDescription(//from   w w  w.ja  v a2  s  . co m
                    "The maximum number of vectors to output.  If not specified, then it will loop over all docs")
            .withShortName("m").create();

    Option fieldOpt = obuilder.withLongName("field").withRequired(true)
            .withArgument(abuilder.withName("field").withMinimum(1).withMaximum(1).create())
            .withDescription("The field in the index").withShortName("f").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
            .withOption(fieldOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        File file = new File(cmdLine.getValue(inputOpt).toString());

        if (!file.isDirectory()) {
            throw new IllegalArgumentException(file + " does not exist or is not a directory");
        }

        long maxDocs = Long.MAX_VALUE;
        if (cmdLine.hasOption(maxOpt)) {
            maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
        }

        if (maxDocs < 0) {
            throw new IllegalArgumentException("maxDocs must be >= 0");
        }

        String field = cmdLine.getValue(fieldOpt).toString();

        PrintWriter out = null;
        if (cmdLine.hasOption(outputOpt)) {
            out = new PrintWriter(new FileWriter(cmdLine.getValue(outputOpt).toString()));
        } else {
            out = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"));
        }

        File output = new File("/home/drew/taming-text/delicious/training");
        output.mkdirs();

        emitTextForTags(file, output);

        IOUtils.close(Collections.singleton(out));
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }

}

From source file:com.tamingtext.util.SplitInput.java

License:Apache License

/** Perform a split on the specified input file. Results will be written to files of the same name in the specified 
 *  training and test output directories. The {@link #validate()} method is called prior to executing the split.
 *///from   ww  w .  j  a v  a 2s . c o  m
public void splitFile(Path inputFile) throws IOException {
    if (fs.getFileStatus(inputFile) == null) {
        throw new IOException(inputFile + " does not exist");
    } else if (fs.getFileStatus(inputFile).isDir()) {
        throw new IOException(inputFile + " is a directory");
    }

    validate();

    Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
    Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

    int lineCount = countLines(fs, inputFile, charset);

    log.info("{} has {} lines", inputFile.getName(), lineCount);

    int testSplitStart = 0;
    int testSplitSize = this.testSplitSize; // don't modify state
    BitSet randomSel = null;

    if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
        testSplitSize = this.testRandomSelectionSize;

        if (testRandomSelectionPct > 0) {
            testSplitSize = Math.round(lineCount * (testRandomSelectionPct / 100.0f));
        }
        log.info("{} test split size is {} based on random selection percentage {}",
                new Object[] { inputFile.getName(), testSplitSize, testRandomSelectionPct });
        long[] ridx = new long[testSplitSize];
        RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
        randomSel = new BitSet(lineCount);
        for (long idx : ridx) {
            randomSel.set((int) idx + 1);
        }
    } else {
        if (testSplitPct > 0) { // calculate split size based on percentage
            testSplitSize = Math.round(lineCount * (testSplitPct / 100.0f));
            log.info("{} test split size is {} based on percentage {}",
                    new Object[] { inputFile.getName(), testSplitSize, testSplitPct });
        } else {
            log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
        }

        if (splitLocation > 0) { // calculate start of split based on percentage
            testSplitStart = Math.round(lineCount * (splitLocation / 100.0f));
            if (lineCount - testSplitStart < testSplitSize) {
                // adjust split start downwards based on split size.
                testSplitStart = lineCount - testSplitSize;
            }
            log.info("{} test split start is {} based on split location {}",
                    new Object[] { inputFile.getName(), testSplitStart, splitLocation });
        }

        if (testSplitStart < 0) {
            throw new IllegalArgumentException(
                    "test split size for " + inputFile + " is too large, it would produce an "
                            + "empty training set from the initial set of " + lineCount + " examples");
        } else if ((lineCount - testSplitSize) < testSplitSize) {
            log.warn(
                    "Test set size for {} may be too large, {} is larger than the number of "
                            + "lines remaining in the training set: {}",
                    new Object[] { inputFile, testSplitSize, lineCount - testSplitSize });
        }
    }

    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
    Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
    Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

    int pos = 0;
    int trainCount = 0;
    int testCount = 0;

    String line;
    while ((line = reader.readLine()) != null) {
        pos++;

        Writer writer;
        if (testRandomSelectionPct > 0) { // Randomly choose
            writer = randomSel.get(pos) ? testWriter : trainingWriter;
        } else { // Choose based on location
            writer = pos > testSplitStart ? testWriter : trainingWriter;
        }

        if (writer == testWriter) {
            if (testCount >= testSplitSize) {
                writer = trainingWriter;
            } else {
                testCount++;
            }
        }

        if (writer == trainingWriter) {
            trainCount++;
        }

        writer.write(line);
        writer.write('\n');
    }

    IOUtils.close(Collections.singleton(trainingWriter));
    IOUtils.close(Collections.singleton(testWriter));

    log.info("file: {}, input: {} train: {}, test: {} starting at {}",
            new Object[] { inputFile.getName(), lineCount, trainCount, testCount, testSplitStart });

    // testing;
    if (callback != null) {
        callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
    }
}

From source file:com.tamingtext.util.SplitInput.java

License:Apache License

/** Count the lines in the file specified as returned by <code>BufferedReader.readLine()</code>
 * /*from ww  w .  java2 s .  c  o  m*/
 * @param inputFile 
 *   the file whose lines will be counted
 *   
 * @param charset
 *   the charset of the file to read
 *   
 * @return the number of lines in the input file.
 * 
 * @throws IOException 
 *   if there is a problem opening or reading the file.
 */
public static int countLines(FileSystem fs, Path inputFile, Charset charset) throws IOException {
    int lineCount = 0;
    BufferedReader countReader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
    try {
        while (countReader.readLine() != null) {
            lineCount++;
        }
    } finally {
        IOUtils.close(Collections.singleton(countReader));
    }

    return lineCount;
}