List of usage examples for org.apache.mahout.common.IOUtils.close
public static void close(Collection<? extends Closeable> closeables) throws IOException
From source file:com.tamingtext.tagging.LuceneCategoryExtractor.java
License:Apache License
public static void main(String[] args) throws IOException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = obuilder.withLongName("dir").withRequired(true) .withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create()) .withDescription("The Lucene directory").withShortName("d").create(); Option outputOpt = obuilder.withLongName("output").withRequired(false) .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create()) .withDescription("The output directory").withShortName("o").create(); Option maxOpt = obuilder.withLongName("max").withRequired(false) .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create()) .withDescription(//w ww . j a v a 2 s . co m "The maximum number of documents to analyze. If not specified, then it will loop over all docs") .withShortName("m").create(); Option fieldOpt = obuilder.withLongName("field").withRequired(true) .withArgument(abuilder.withName("field").withMinimum(1).withMaximum(1).create()) .withDescription("The field in the index").withShortName("f").create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") .create(); Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt) .withOption(fieldOpt).create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } File inputDir = new File(cmdLine.getValue(inputOpt).toString()); if (!inputDir.isDirectory()) { throw new IllegalArgumentException(inputDir + " does not exist or is not a directory"); } long maxDocs = Long.MAX_VALUE; if (cmdLine.hasOption(maxOpt)) { maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString()); } if (maxDocs < 0) { throw new IllegalArgumentException("maxDocs must 
be >= 0"); } String field = cmdLine.getValue(fieldOpt).toString(); PrintWriter out = null; if (cmdLine.hasOption(outputOpt)) { out = new PrintWriter(new FileWriter(cmdLine.getValue(outputOpt).toString())); } else { out = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8")); } dumpDocumentFields(inputDir, field, maxDocs, out); IOUtils.close(Collections.singleton(out)); } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } }
From source file:com.tamingtext.tagging.LuceneTagExtractor.java
License:Apache License
public static void main(String[] args) throws IOException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = obuilder.withLongName("dir").withRequired(true) .withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create()) .withDescription("The Lucene directory").withShortName("d").create(); Option outputOpt = obuilder.withLongName("output").withRequired(false) .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create()) .withDescription("The output directory").withShortName("o").create(); Option maxOpt = obuilder.withLongName("max").withRequired(false) .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create()) .withDescription(//from w w w.ja v a2 s . co m "The maximum number of vectors to output. If not specified, then it will loop over all docs") .withShortName("m").create(); Option fieldOpt = obuilder.withLongName("field").withRequired(true) .withArgument(abuilder.withName("field").withMinimum(1).withMaximum(1).create()) .withDescription("The field in the index").withShortName("f").create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") .create(); Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt) .withOption(fieldOpt).create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } File file = new File(cmdLine.getValue(inputOpt).toString()); if (!file.isDirectory()) { throw new IllegalArgumentException(file + " does not exist or is not a directory"); } long maxDocs = Long.MAX_VALUE; if (cmdLine.hasOption(maxOpt)) { maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString()); } if (maxDocs < 0) { throw new IllegalArgumentException("maxDocs must be >= 0"); } 
String field = cmdLine.getValue(fieldOpt).toString(); PrintWriter out = null; if (cmdLine.hasOption(outputOpt)) { out = new PrintWriter(new FileWriter(cmdLine.getValue(outputOpt).toString())); } else { out = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8")); } File output = new File("/home/drew/taming-text/delicious/training"); output.mkdirs(); emitTextForTags(file, output); IOUtils.close(Collections.singleton(out)); } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } }
From source file:com.tamingtext.util.SplitInput.java
License:Apache License
/** Perform a split on the specified input file. Results will be written to files of the same name in the specified * training and test output directories. The {@link #validate()} method is called prior to executing the split. *///from ww w . j a v a 2s . c o m public void splitFile(Path inputFile) throws IOException { if (fs.getFileStatus(inputFile) == null) { throw new IOException(inputFile + " does not exist"); } else if (fs.getFileStatus(inputFile).isDir()) { throw new IOException(inputFile + " is a directory"); } validate(); Path testOutputFile = new Path(testOutputDirectory, inputFile.getName()); Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName()); int lineCount = countLines(fs, inputFile, charset); log.info("{} has {} lines", inputFile.getName(), lineCount); int testSplitStart = 0; int testSplitSize = this.testSplitSize; // don't modify state BitSet randomSel = null; if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) { testSplitSize = this.testRandomSelectionSize; if (testRandomSelectionPct > 0) { testSplitSize = Math.round(lineCount * (testRandomSelectionPct / 100.0f)); } log.info("{} test split size is {} based on random selection percentage {}", new Object[] { inputFile.getName(), testSplitSize, testRandomSelectionPct }); long[] ridx = new long[testSplitSize]; RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom()); randomSel = new BitSet(lineCount); for (long idx : ridx) { randomSel.set((int) idx + 1); } } else { if (testSplitPct > 0) { // calculate split size based on percentage testSplitSize = Math.round(lineCount * (testSplitPct / 100.0f)); log.info("{} test split size is {} based on percentage {}", new Object[] { inputFile.getName(), testSplitSize, testSplitPct }); } else { log.info("{} test split size is {}", inputFile.getName(), testSplitSize); } if (splitLocation > 0) { // calculate start of split based on percentage testSplitStart = Math.round(lineCount * 
(splitLocation / 100.0f)); if (lineCount - testSplitStart < testSplitSize) { // adjust split start downwards based on split size. testSplitStart = lineCount - testSplitSize; } log.info("{} test split start is {} based on split location {}", new Object[] { inputFile.getName(), testSplitStart, splitLocation }); } if (testSplitStart < 0) { throw new IllegalArgumentException( "test split size for " + inputFile + " is too large, it would produce an " + "empty training set from the initial set of " + lineCount + " examples"); } else if ((lineCount - testSplitSize) < testSplitSize) { log.warn( "Test set size for {} may be too large, {} is larger than the number of " + "lines remaining in the training set: {}", new Object[] { inputFile, testSplitSize, lineCount - testSplitSize }); } } BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset)); Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset); Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset); int pos = 0; int trainCount = 0; int testCount = 0; String line; while ((line = reader.readLine()) != null) { pos++; Writer writer; if (testRandomSelectionPct > 0) { // Randomly choose writer = randomSel.get(pos) ? testWriter : trainingWriter; } else { // Choose based on location writer = pos > testSplitStart ? testWriter : trainingWriter; } if (writer == testWriter) { if (testCount >= testSplitSize) { writer = trainingWriter; } else { testCount++; } } if (writer == trainingWriter) { trainCount++; } writer.write(line); writer.write('\n'); } IOUtils.close(Collections.singleton(trainingWriter)); IOUtils.close(Collections.singleton(testWriter)); log.info("file: {}, input: {} train: {}, test: {} starting at {}", new Object[] { inputFile.getName(), lineCount, trainCount, testCount, testSplitStart }); // testing; if (callback != null) { callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart); } }
From source file:com.tamingtext.util.SplitInput.java
License:Apache License
/** Count the lines in the file specified as returned by <code>BufferedReader.readLine()</code> * /*from ww w . java2 s . c o m*/ * @param inputFile * the file whose lines will be counted * * @param charset * the charset of the file to read * * @return the number of lines in the input file. * * @throws IOException * if there is a problem opening or reading the file. */ public static int countLines(FileSystem fs, Path inputFile, Charset charset) throws IOException { int lineCount = 0; BufferedReader countReader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset)); try { while (countReader.readLine() != null) { lineCount++; } } finally { IOUtils.close(Collections.singleton(countReader)); } return lineCount; }