Example usage for org.apache.mahout.common IOUtils close

List of usage examples for org.apache.mahout.common IOUtils close

Introduction

In this page you can find the example usage for org.apache.mahout.common IOUtils close.

Prototype

public static void close(Collection<? extends Closeable> closeables) throws IOException 

Source Link

Document

make sure to close all sources, log all of the problems occurred, clear closeables (to prevent repeating close attempts), re-throw the last one at the end.

Usage

From source file:com.tamingtext.tagging.LuceneCategoryExtractor.java

License:Apache License

public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("dir").withRequired(true)
            .withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
            .withDescription("The Lucene directory").withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(false)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();

    Option maxOpt = obuilder.withLongName("max").withRequired(false)
            .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create())
            .withDescription(//w  ww .  j a  v a 2  s .  co  m
                    "The maximum number of documents to analyze.  If not specified, then it will loop over all docs")
            .withShortName("m").create();

    Option fieldOpt = obuilder.withLongName("field").withRequired(true)
            .withArgument(abuilder.withName("field").withMinimum(1).withMaximum(1).create())
            .withDescription("The field in the index").withShortName("f").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
            .withOption(fieldOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        File inputDir = new File(cmdLine.getValue(inputOpt).toString());

        if (!inputDir.isDirectory()) {
            throw new IllegalArgumentException(inputDir + " does not exist or is not a directory");
        }

        long maxDocs = Long.MAX_VALUE;
        if (cmdLine.hasOption(maxOpt)) {
            maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
        }

        if (maxDocs < 0) {
            throw new IllegalArgumentException("maxDocs must be >= 0");
        }

        String field = cmdLine.getValue(fieldOpt).toString();

        PrintWriter out = null;
        if (cmdLine.hasOption(outputOpt)) {
            out = new PrintWriter(new FileWriter(cmdLine.getValue(outputOpt).toString()));
        } else {
            out = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"));
        }

        dumpDocumentFields(inputDir, field, maxDocs, out);

        IOUtils.close(Collections.singleton(out));
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }

}

From source file:com.tamingtext.tagging.LuceneTagExtractor.java

License:Apache License

public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("dir").withRequired(true)
            .withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
            .withDescription("The Lucene directory").withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(false)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();

    Option maxOpt = obuilder.withLongName("max").withRequired(false)
            .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create())
            .withDescription(//from   w w  w.ja  v a2  s  . co m
                    "The maximum number of vectors to output.  If not specified, then it will loop over all docs")
            .withShortName("m").create();

    Option fieldOpt = obuilder.withLongName("field").withRequired(true)
            .withArgument(abuilder.withName("field").withMinimum(1).withMaximum(1).create())
            .withDescription("The field in the index").withShortName("f").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
            .withOption(fieldOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        File file = new File(cmdLine.getValue(inputOpt).toString());

        if (!file.isDirectory()) {
            throw new IllegalArgumentException(file + " does not exist or is not a directory");
        }

        long maxDocs = Long.MAX_VALUE;
        if (cmdLine.hasOption(maxOpt)) {
            maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
        }

        if (maxDocs < 0) {
            throw new IllegalArgumentException("maxDocs must be >= 0");
        }

        String field = cmdLine.getValue(fieldOpt).toString();

        PrintWriter out = null;
        if (cmdLine.hasOption(outputOpt)) {
            out = new PrintWriter(new FileWriter(cmdLine.getValue(outputOpt).toString()));
        } else {
            out = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"));
        }

        File output = new File("/home/drew/taming-text/delicious/training");
        output.mkdirs();

        emitTextForTags(file, output);

        IOUtils.close(Collections.singleton(out));
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }

}

From source file:com.tamingtext.util.SplitInput.java

License:Apache License

/** Perform a split on the specified input file. Results will be written to files of the same name in the specified 
 *  training and test output directories. The {@link #validate()} method is called prior to executing the split.
 *///from   ww  w .  j  a v  a 2s . c o  m
public void splitFile(Path inputFile) throws IOException {
    if (fs.getFileStatus(inputFile) == null) {
        throw new IOException(inputFile + " does not exist");
    } else if (fs.getFileStatus(inputFile).isDir()) {
        throw new IOException(inputFile + " is a directory");
    }

    validate();

    Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
    Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

    int lineCount = countLines(fs, inputFile, charset);

    log.info("{} has {} lines", inputFile.getName(), lineCount);

    int testSplitStart = 0;
    int testSplitSize = this.testSplitSize; // don't modify state
    BitSet randomSel = null;

    if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
        testSplitSize = this.testRandomSelectionSize;

        if (testRandomSelectionPct > 0) {
            testSplitSize = Math.round(lineCount * (testRandomSelectionPct / 100.0f));
        }
        log.info("{} test split size is {} based on random selection percentage {}",
                new Object[] { inputFile.getName(), testSplitSize, testRandomSelectionPct });
        long[] ridx = new long[testSplitSize];
        RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
        randomSel = new BitSet(lineCount);
        for (long idx : ridx) {
            randomSel.set((int) idx + 1);
        }
    } else {
        if (testSplitPct > 0) { // calculate split size based on percentage
            testSplitSize = Math.round(lineCount * (testSplitPct / 100.0f));
            log.info("{} test split size is {} based on percentage {}",
                    new Object[] { inputFile.getName(), testSplitSize, testSplitPct });
        } else {
            log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
        }

        if (splitLocation > 0) { // calculate start of split based on percentage
            testSplitStart = Math.round(lineCount * (splitLocation / 100.0f));
            if (lineCount - testSplitStart < testSplitSize) {
                // adjust split start downwards based on split size.
                testSplitStart = lineCount - testSplitSize;
            }
            log.info("{} test split start is {} based on split location {}",
                    new Object[] { inputFile.getName(), testSplitStart, splitLocation });
        }

        if (testSplitStart < 0) {
            throw new IllegalArgumentException(
                    "test split size for " + inputFile + " is too large, it would produce an "
                            + "empty training set from the initial set of " + lineCount + " examples");
        } else if ((lineCount - testSplitSize) < testSplitSize) {
            log.warn(
                    "Test set size for {} may be too large, {} is larger than the number of "
                            + "lines remaining in the training set: {}",
                    new Object[] { inputFile, testSplitSize, lineCount - testSplitSize });
        }
    }

    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
    Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
    Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

    int pos = 0;
    int trainCount = 0;
    int testCount = 0;

    String line;
    while ((line = reader.readLine()) != null) {
        pos++;

        Writer writer;
        if (testRandomSelectionPct > 0) { // Randomly choose
            writer = randomSel.get(pos) ? testWriter : trainingWriter;
        } else { // Choose based on location
            writer = pos > testSplitStart ? testWriter : trainingWriter;
        }

        if (writer == testWriter) {
            if (testCount >= testSplitSize) {
                writer = trainingWriter;
            } else {
                testCount++;
            }
        }

        if (writer == trainingWriter) {
            trainCount++;
        }

        writer.write(line);
        writer.write('\n');
    }

    IOUtils.close(Collections.singleton(trainingWriter));
    IOUtils.close(Collections.singleton(testWriter));

    log.info("file: {}, input: {} train: {}, test: {} starting at {}",
            new Object[] { inputFile.getName(), lineCount, trainCount, testCount, testSplitStart });

    // testing;
    if (callback != null) {
        callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
    }
}

From source file:com.tamingtext.util.SplitInput.java

License:Apache License

/** Count the lines in the file specified as returned by <code>BufferedReader.readLine()</code>
 * /*from ww  w .  java2 s .  c  o  m*/
 * @param inputFile 
 *   the file whose lines will be counted
 *   
 * @param charset
 *   the charset of the file to read
 *   
 * @return the number of lines in the input file.
 * 
 * @throws IOException 
 *   if there is a problem opening or reading the file.
 */
public static int countLines(FileSystem fs, Path inputFile, Charset charset) throws IOException {
    int lineCount = 0;
    BufferedReader countReader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
    try {
        while (countReader.readLine() != null) {
            lineCount++;
        }
    } finally {
        IOUtils.close(Collections.singleton(countReader));
    }

    return lineCount;
}