List of usage examples for the org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator constructor
public SequenceFileIterator(Path path, boolean reuseKeyValueInstances, Configuration conf) throws IOException
public SequenceFileIterator(Path path, boolean reuseKeyValueInstances, Configuration conf) throws IOException
From source file: com.netease.news.utils.SequenceFileDumper.java
License: Apache License
@Override public int run(String[] args) throws Exception { addInputOption();/*from w w w . j av a 2 s . c o m*/ addOutputOption(); addOption("substring", "b", "The number of chars to print out per value", false); addOption(buildOption("count", "c", "Report the count only", false, false, null)); addOption("numItems", "n", "Output at most <n> key value pairs", false); addOption( buildOption("facets", "fa", "Output the counts per key. Note, if there are a lot of unique keys, " + "this can take up a fair amount of memory", false, false, null)); addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null)); if (parseArguments(args, false, true) == null) { return -1; } Path[] pathArr; Configuration conf = new Configuration(); Path input = getInputPath(); FileSystem fs = input.getFileSystem(conf); if (fs.getFileStatus(input).isDir()) { pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter())); } else { pathArr = new Path[1]; pathArr[0] = input; } Writer writer; boolean shouldClose; if (hasOption("output")) { shouldClose = true; writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8); } else { shouldClose = false; writer = new OutputStreamWriter(System.out, Charsets.UTF_8); } try { for (Path path : pathArr) { if (!hasOption("quiet")) { writer.append("Input Path: ").append(String.valueOf(path)).append('\n'); } int sub = Integer.MAX_VALUE; if (hasOption("substring")) { sub = Integer.parseInt(getOption("substring")); } boolean countOnly = hasOption("count"); SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true, conf); if (!hasOption("quiet")) { writer.append("Key class: ").append(iterator.getKeyClass().toString()); writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n'); } OpenObjectIntHashMap<String> facets = null; if (hasOption("facets")) { facets = new OpenObjectIntHashMap<String>(); } long count = 0; if (countOnly) { while 
(iterator.hasNext()) { Pair<?, ?> record = iterator.next(); String key = record.getFirst().toString(); if (facets != null) { facets.adjustOrPutValue(key, 1, 1); //either insert or add 1 } count++; } writer.append("Count: ").append(String.valueOf(count)).append('\n'); } else { long numItems = Long.MAX_VALUE; if (hasOption("numItems")) { numItems = Long.parseLong(getOption("numItems")); if (!hasOption("quiet")) { writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n"); } } while (iterator.hasNext() && count < numItems) { Pair<?, ?> record = iterator.next(); String key = record.getFirst().toString(); writer.append("Key: ").append(key); String str = record.getSecond().toString(); writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str); writer.write('\n'); if (facets != null) { facets.adjustOrPutValue(key, 1, 1); //either insert or add 1 } count++; } if (!hasOption("quiet")) { writer.append("Count: ").append(String.valueOf(count)).append('\n'); } } if (facets != null) { List<String> keyList = Lists.newArrayListWithCapacity(facets.size()); IntArrayList valueList = new IntArrayList(facets.size()); facets.pairsSortedByKey(keyList, valueList); writer.append("-----Facets---\n"); writer.append("Key\t\tCount\n"); int i = 0; for (String key : keyList) { writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n'); } } } writer.flush(); } finally { if (shouldClose) { Closeables.close(writer, false); } } return 0; }
From source file: com.netease.news.utils.SplitInput.java
License: Apache License
/** * Perform a split on the specified input file. Results will be written to files of the same name in the specified * training and test output directories. The {@link #validate()} method is called prior to executing the split. */// w w w . j av a 2 s. co m public void splitFile(Path inputFile) throws IOException { Configuration conf = getConf(); FileSystem fs = inputFile.getFileSystem(conf); if (fs.getFileStatus(inputFile) == null) { throw new IOException(inputFile + " does not exist"); } if (fs.getFileStatus(inputFile).isDir()) { throw new IOException(inputFile + " is a directory"); } validate(); Path testOutputFile = new Path(testOutputDirectory, inputFile.getName()); Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName()); int lineCount = countLines(fs, inputFile, charset); log.info("{} has {} lines", inputFile.getName(), lineCount); int testSplitStart = 0; int testSplitSize = this.testSplitSize; // don't modify state BitSet randomSel = null; if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) { testSplitSize = this.testRandomSelectionSize; if (testRandomSelectionPct > 0) { testSplitSize = Math.round(lineCount * testRandomSelectionPct / 100.0f); } log.info("{} test split size is {} based on random selection percentage {}", inputFile.getName(), testSplitSize, testRandomSelectionPct); long[] ridx = new long[testSplitSize]; RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom()); randomSel = new BitSet(lineCount); for (long idx : ridx) { randomSel.set((int) idx + 1); } } else { if (testSplitPct > 0) { // calculate split size based on percentage testSplitSize = Math.round(lineCount * testSplitPct / 100.0f); log.info("{} test split size is {} based on percentage {}", inputFile.getName(), testSplitSize, testSplitPct); } else { log.info("{} test split size is {}", inputFile.getName(), testSplitSize); } if (splitLocation > 0) { // calculate start of split based on percentage 
testSplitStart = Math.round(lineCount * splitLocation / 100.0f); if (lineCount - testSplitStart < testSplitSize) { // adjust split start downwards based on split size. testSplitStart = lineCount - testSplitSize; } log.info("{} test split start is {} based on split location {}", inputFile.getName(), testSplitStart, splitLocation); } if (testSplitStart < 0) { throw new IllegalArgumentException( "test split size for " + inputFile + " is too large, it would produce an " + "empty training set from the initial set of " + lineCount + " examples"); } else if (lineCount - testSplitSize < testSplitSize) { log.warn( "Test set size for {} may be too large, {} is larger than the number of " + "lines remaining in the training set: {}", inputFile, testSplitSize, lineCount - testSplitSize); } } int trainCount = 0; int testCount = 0; if (!useSequence) { BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset)); Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset); Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset); try { String line; int pos = 0; while ((line = reader.readLine()) != null) { pos++; Writer writer; if (testRandomSelectionPct > 0) { // Randomly choose writer = randomSel.get(pos) ? testWriter : trainingWriter; } else { // Choose based on location writer = pos > testSplitStart ? 
testWriter : trainingWriter; } if (writer == testWriter) { if (testCount >= testSplitSize) { writer = trainingWriter; } else { testCount++; } } if (writer == trainingWriter) { trainCount++; } writer.write(line); writer.write('\n'); } } finally { Closeables.close(reader, true); Closeables.close(trainingWriter, false); Closeables.close(testWriter, false); } } else { SequenceFileIterator<Writable, Writable> iterator = new SequenceFileIterator<Writable, Writable>( inputFile, false, fs.getConf()); SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile, iterator.getKeyClass(), iterator.getValueClass()); SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile, iterator.getKeyClass(), iterator.getValueClass()); try { int pos = 0; while (iterator.hasNext()) { pos++; SequenceFile.Writer writer; if (testRandomSelectionPct > 0) { // Randomly choose writer = randomSel.get(pos) ? testWriter : trainingWriter; } else { // Choose based on location writer = pos > testSplitStart ? testWriter : trainingWriter; } if (writer == testWriter) { if (testCount >= testSplitSize) { writer = trainingWriter; } else { testCount++; } } if (writer == trainingWriter) { trainCount++; } Pair<Writable, Writable> pair = iterator.next(); writer.append(pair.getFirst(), pair.getSecond()); } } finally { Closeables.close(iterator, true); Closeables.close(trainingWriter, false); Closeables.close(testWriter, false); } } log.info("file: {}, input: {} train: {}, test: {} starting at {}", inputFile.getName(), lineCount, trainCount, testCount, testSplitStart); // testing; if (callback != null) { callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart); } }
From source file: com.twitter.algebra.AlgebraCommon.java
License: Apache License
/**
 * Read a vector from the filesystem and convert it to a dense vector.
 * TODO: how about sparse vectors
 *
 * @param vectorFile the file that contains the vector data in SequenceFile format
 * @param conf the Hadoop configuration
 * @return a dense copy of the first vector stored in the file
 * @throws IOException if the file cannot be read or contains no vector
 */
public static DenseVector toDenseVector(Path vectorFile, Configuration conf) throws IOException {
    SequenceFileIterator<IntWritable, VectorWritable> iterator =
            new SequenceFileIterator<IntWritable, VectorWritable>(vectorFile, true, conf);
    DenseVector vector;
    try {
        // FIX: next() on an empty file would throw an unhelpful NoSuchElementException;
        // fail with the declared IOException instead.
        if (!iterator.hasNext()) {
            throw new IOException("No vector found in " + vectorFile);
        }
        Pair<IntWritable, VectorWritable> next = iterator.next();
        vector = new DenseVector(next.getSecond().get());
    } finally {
        Closeables.close(iterator, false);
    }
    return vector;
}
From source file: org.qcri.pca.MeanAndSpanJob.java
/** * After running the job, we can load the results from HDFS with this method * /*from w w w . j av a 2s . c om*/ * @param meanSpanPath * the path to the single file having the results * @param normalizeMean * normalize the mean as well * @param conf * the configuration * @throws IOException * when face problem parsing the result file */ public void loadResults(Path meanSpanPath, boolean normalizeMean, Configuration conf) throws IOException { SequenceFileIterator<IntWritable, VectorWritable> iterator = new SequenceFileIterator<IntWritable, VectorWritable>( meanSpanPath, true, conf); try { Pair<IntWritable, VectorWritable> next; next = iterator.next(); if (next.getFirst().get() == MEANVECTOR) meanVector = next.getSecond().get(); else spanVector = next.getSecond().get(); next = iterator.next(); if (next.getFirst().get() == MEANVECTOR) meanVector = next.getSecond().get(); else spanVector = next.getSecond().get(); } finally { Closeables.close(iterator, false); } if (normalizeMean) meanVector.assign(spanVector, new DoubleDoubleFunction() { @Override public double apply(double v, double span) { return v / (span != 0 ? span : 1); } }); }
From source file: org.qcri.pca.Norm2Job.java
/**
 * Loads the single norm2 value produced by the job from HDFS.
 *
 * @param outputDirPath directory containing the reducer output
 * @param conf the Hadoop configuration
 * @return the norm2 value stored in the single output record
 * @throws IOException if the file cannot be read or holds more than one value
 */
public double loadResult(Path outputDirPath, Configuration conf) throws IOException {
    Path resultFile = new Path(outputDirPath, "part-r-00000");
    SequenceFileIterator<NullWritable, DoubleWritable> reader =
            new SequenceFileIterator<NullWritable, DoubleWritable>(resultFile, true, conf);
    try {
        double result = reader.next().getSecond().get();
        // A single-reducer job must emit exactly one record.
        if (reader.hasNext()) {
            throw new IOException("More than one value after norm2Job!");
        }
        return result;
    } finally {
        Closeables.close(reader, false);
    }
}
From source file: org.qcri.pca.PCACommon.java
static DenseVector toDenseVector(Path vectorFile, Configuration conf) throws IOException { SequenceFileIterator<IntWritable, VectorWritable> iterator = new SequenceFileIterator<IntWritable, VectorWritable>( vectorFile, true, conf);// w w w .ja va2 s .c om DenseVector vector; try { Pair<IntWritable, VectorWritable> next; next = iterator.next(); vector = new DenseVector(next.getSecond().get()); } finally { Closeables.close(iterator, false); } return vector; }
From source file: org.qcri.pca.ReconstructionErrJob.java
/**
 * Loads every (index, value) record produced by the job and feeds each one to
 * {@code readIndividualResult}.
 *
 * @param outDirPath directory containing the reducer output file
 * @param conf the Hadoop configuration
 * @throws IOException when the result file cannot be read
 */
public void loadResults(Path outDirPath, Configuration conf) throws IOException {
    Path resultFile = new Path(outDirPath, "part-r-00000");
    SequenceFileIterator<IntWritable, DoubleWritable> reader =
            new SequenceFileIterator<IntWritable, DoubleWritable>(resultFile, true, conf);
    try {
        while (reader.hasNext()) {
            Pair<IntWritable, DoubleWritable> record = reader.next();
            readIndividualResult(record.getFirst().get(), record.getSecond().get());
        }
    } finally {
        Closeables.close(reader, false);
    }
}
From source file: org.qcri.pca.VarianceJob.java
/**
 * Loads the final variance sum produced by the job from HDFS into {@code finalSum}.
 *
 * @param outDirPath directory containing the reducer output
 * @param conf the Hadoop configuration
 * @throws IOException if the result file cannot be read or is empty
 */
public void loadResult(Path outDirPath, Configuration conf) throws IOException {
    Path finalNumberFile = new Path(outDirPath, "part-r-00000");
    SequenceFileIterator<NullWritable, DoubleWritable> iterator =
            new SequenceFileIterator<NullWritable, DoubleWritable>(finalNumberFile, true, conf);
    try {
        // FIX: next() on an empty file would throw an unhelpful NoSuchElementException;
        // fail with the declared IOException instead.
        if (!iterator.hasNext()) {
            throw new IOException("No value found in " + finalNumberFile);
        }
        finalSum = iterator.next().getSecond().get();
    } finally {
        Closeables.close(iterator, false);
    }
}