List of usage examples for the org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator constructor
public SequenceFileIterator(Path path, boolean reuseKeyValueInstances, Configuration conf) throws IOException
public SequenceFileIterator(Path path, boolean reuseKeyValueInstances, Configuration conf) throws IOException
From source file: com.netease.news.utils.SequenceFileDumper.java
License: Apache License
@Override public int run(String[] args) throws Exception { addInputOption();/*from w w w . j av a 2 s . c o m*/ addOutputOption(); addOption("substring", "b", "The number of chars to print out per value", false); addOption(buildOption("count", "c", "Report the count only", false, false, null)); addOption("numItems", "n", "Output at most <n> key value pairs", false); addOption( buildOption("facets", "fa", "Output the counts per key. Note, if there are a lot of unique keys, " + "this can take up a fair amount of memory", false, false, null)); addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null)); if (parseArguments(args, false, true) == null) { return -1; } Path[] pathArr; Configuration conf = new Configuration(); Path input = getInputPath(); FileSystem fs = input.getFileSystem(conf); if (fs.getFileStatus(input).isDir()) { pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter())); } else { pathArr = new Path[1]; pathArr[0] = input; } Writer writer; boolean shouldClose; if (hasOption("output")) { shouldClose = true; writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8); } else { shouldClose = false; writer = new OutputStreamWriter(System.out, Charsets.UTF_8); } try { for (Path path : pathArr) { if (!hasOption("quiet")) { writer.append("Input Path: ").append(String.valueOf(path)).append('\n'); } int sub = Integer.MAX_VALUE; if (hasOption("substring")) { sub = Integer.parseInt(getOption("substring")); } boolean countOnly = hasOption("count"); SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true, conf); if (!hasOption("quiet")) { writer.append("Key class: ").append(iterator.getKeyClass().toString()); writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n'); } OpenObjectIntHashMap<String> facets = null; if (hasOption("facets")) { facets = new OpenObjectIntHashMap<String>(); } long count = 0; if (countOnly) { while 
(iterator.hasNext()) { Pair<?, ?> record = iterator.next(); String key = record.getFirst().toString(); if (facets != null) { facets.adjustOrPutValue(key, 1, 1); //either insert or add 1 } count++; } writer.append("Count: ").append(String.valueOf(count)).append('\n'); } else { long numItems = Long.MAX_VALUE; if (hasOption("numItems")) { numItems = Long.parseLong(getOption("numItems")); if (!hasOption("quiet")) { writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n"); } } while (iterator.hasNext() && count < numItems) { Pair<?, ?> record = iterator.next(); String key = record.getFirst().toString(); writer.append("Key: ").append(key); String str = record.getSecond().toString(); writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str); writer.write('\n'); if (facets != null) { facets.adjustOrPutValue(key, 1, 1); //either insert or add 1 } count++; } if (!hasOption("quiet")) { writer.append("Count: ").append(String.valueOf(count)).append('\n'); } } if (facets != null) { List<String> keyList = Lists.newArrayListWithCapacity(facets.size()); IntArrayList valueList = new IntArrayList(facets.size()); facets.pairsSortedByKey(keyList, valueList); writer.append("-----Facets---\n"); writer.append("Key\t\tCount\n"); int i = 0; for (String key : keyList) { writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n'); } } } writer.flush(); } finally { if (shouldClose) { Closeables.close(writer, false); } } return 0; }
From source file: com.netease.news.utils.SplitInput.java
License: Apache License
/** * Perform a split on the specified input file. Results will be written to files of the same name in the specified * training and test output directories. The {@link #validate()} method is called prior to executing the split. */// w w w . j av a 2 s. co m public void splitFile(Path inputFile) throws IOException { Configuration conf = getConf(); FileSystem fs = inputFile.getFileSystem(conf); if (fs.getFileStatus(inputFile) == null) { throw new IOException(inputFile + " does not exist"); } if (fs.getFileStatus(inputFile).isDir()) { throw new IOException(inputFile + " is a directory"); } validate(); Path testOutputFile = new Path(testOutputDirectory, inputFile.getName()); Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName()); int lineCount = countLines(fs, inputFile, charset); log.info("{} has {} lines", inputFile.getName(), lineCount); int testSplitStart = 0; int testSplitSize = this.testSplitSize; // don't modify state BitSet randomSel = null; if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) { testSplitSize = this.testRandomSelectionSize; if (testRandomSelectionPct > 0) { testSplitSize = Math.round(lineCount * testRandomSelectionPct / 100.0f); } log.info("{} test split size is {} based on random selection percentage {}", inputFile.getName(), testSplitSize, testRandomSelectionPct); long[] ridx = new long[testSplitSize]; RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom()); randomSel = new BitSet(lineCount); for (long idx : ridx) { randomSel.set((int) idx + 1); } } else { if (testSplitPct > 0) { // calculate split size based on percentage testSplitSize = Math.round(lineCount * testSplitPct / 100.0f); log.info("{} test split size is {} based on percentage {}", inputFile.getName(), testSplitSize, testSplitPct); } else { log.info("{} test split size is {}", inputFile.getName(), testSplitSize); } if (splitLocation > 0) { // calculate start of split based on percentage 
testSplitStart = Math.round(lineCount * splitLocation / 100.0f); if (lineCount - testSplitStart < testSplitSize) { // adjust split start downwards based on split size. testSplitStart = lineCount - testSplitSize; } log.info("{} test split start is {} based on split location {}", inputFile.getName(), testSplitStart, splitLocation); } if (testSplitStart < 0) { throw new IllegalArgumentException( "test split size for " + inputFile + " is too large, it would produce an " + "empty training set from the initial set of " + lineCount + " examples"); } else if (lineCount - testSplitSize < testSplitSize) { log.warn( "Test set size for {} may be too large, {} is larger than the number of " + "lines remaining in the training set: {}", inputFile, testSplitSize, lineCount - testSplitSize); } } int trainCount = 0; int testCount = 0; if (!useSequence) { BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset)); Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset); Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset); try { String line; int pos = 0; while ((line = reader.readLine()) != null) { pos++; Writer writer; if (testRandomSelectionPct > 0) { // Randomly choose writer = randomSel.get(pos) ? testWriter : trainingWriter; } else { // Choose based on location writer = pos > testSplitStart ? 
testWriter : trainingWriter; } if (writer == testWriter) { if (testCount >= testSplitSize) { writer = trainingWriter; } else { testCount++; } } if (writer == trainingWriter) { trainCount++; } writer.write(line); writer.write('\n'); } } finally { Closeables.close(reader, true); Closeables.close(trainingWriter, false); Closeables.close(testWriter, false); } } else { SequenceFileIterator<Writable, Writable> iterator = new SequenceFileIterator<Writable, Writable>( inputFile, false, fs.getConf()); SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile, iterator.getKeyClass(), iterator.getValueClass()); SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile, iterator.getKeyClass(), iterator.getValueClass()); try { int pos = 0; while (iterator.hasNext()) { pos++; SequenceFile.Writer writer; if (testRandomSelectionPct > 0) { // Randomly choose writer = randomSel.get(pos) ? testWriter : trainingWriter; } else { // Choose based on location writer = pos > testSplitStart ? testWriter : trainingWriter; } if (writer == testWriter) { if (testCount >= testSplitSize) { writer = trainingWriter; } else { testCount++; } } if (writer == trainingWriter) { trainCount++; } Pair<Writable, Writable> pair = iterator.next(); writer.append(pair.getFirst(), pair.getSecond()); } } finally { Closeables.close(iterator, true); Closeables.close(trainingWriter, false); Closeables.close(testWriter, false); } } log.info("file: {}, input: {} train: {}, test: {} starting at {}", inputFile.getName(), lineCount, trainCount, testCount, testSplitStart); // testing; if (callback != null) { callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart); } }
From source file: com.twitter.algebra.AlgebraCommon.java
License: Apache License
/**
 * Read a vector from the filesystem and convert it to a dense vector.
 * TODO: how about sparse vectors
 *
 * @param vectorFile the file that contains the vector data in SequenceFile format
 * @param conf the Hadoop configuration
 * @return a dense copy of the first vector stored in the file
 * @throws IOException if the file cannot be read or contains no vector
 */
public static DenseVector toDenseVector(Path vectorFile, Configuration conf) throws IOException {
    SequenceFileIterator<IntWritable, VectorWritable> iterator =
            new SequenceFileIterator<IntWritable, VectorWritable>(vectorFile, true, conf);
    DenseVector vector;
    try {
        // FIX: next() on an empty file would throw an unhelpful NoSuchElementException;
        // fail with the declared IOException instead.
        if (!iterator.hasNext()) {
            throw new IOException("No vector found in " + vectorFile);
        }
        Pair<IntWritable, VectorWritable> next = iterator.next();
        vector = new DenseVector(next.getSecond().get());
    } finally {
        Closeables.close(iterator, false);
    }
    return vector;
}
From source file: org.qcri.pca.MeanAndSpanJob.java
/** * After running the job, we can load the results from HDFS with this method * /*from w w w . j av a 2s . c om*/ * @param meanSpanPath * the path to the single file having the results * @param normalizeMean * normalize the mean as well * @param conf * the configuration * @throws IOException * when face problem parsing the result file */ public void loadResults(Path meanSpanPath, boolean normalizeMean, Configuration conf) throws IOException { SequenceFileIterator<IntWritable, VectorWritable> iterator = new SequenceFileIterator<IntWritable, VectorWritable>( meanSpanPath, true, conf); try { Pair<IntWritable, VectorWritable> next; next = iterator.next(); if (next.getFirst().get() == MEANVECTOR) meanVector = next.getSecond().get(); else spanVector = next.getSecond().get(); next = iterator.next(); if (next.getFirst().get() == MEANVECTOR) meanVector = next.getSecond().get(); else spanVector = next.getSecond().get(); } finally { Closeables.close(iterator, false); } if (normalizeMean) meanVector.assign(spanVector, new DoubleDoubleFunction() { @Override public double apply(double v, double span) { return v / (span != 0 ? span : 1); } }); }
From source file: org.qcri.pca.Norm2Job.java
/**
 * Loads the single norm2 value produced by the job from HDFS.
 *
 * @param outputDirPath directory containing the reducer output
 * @param conf the Hadoop configuration
 * @return the norm2 value stored in the single output record
 * @throws IOException if the file cannot be read or holds more than one value
 */
public double loadResult(Path outputDirPath, Configuration conf) throws IOException {
    Path resultFile = new Path(outputDirPath, "part-r-00000");
    SequenceFileIterator<NullWritable, DoubleWritable> reader =
            new SequenceFileIterator<NullWritable, DoubleWritable>(resultFile, true, conf);
    try {
        double result = reader.next().getSecond().get();
        // A single-reducer job must emit exactly one record.
        if (reader.hasNext()) {
            throw new IOException("More than one value after norm2Job!");
        }
        return result;
    } finally {
        Closeables.close(reader, false);
    }
}
From source file: org.qcri.pca.PCACommon.java
static DenseVector toDenseVector(Path vectorFile, Configuration conf) throws IOException { SequenceFileIterator<IntWritable, VectorWritable> iterator = new SequenceFileIterator<IntWritable, VectorWritable>( vectorFile, true, conf);// w w w .ja va2 s .c om DenseVector vector; try { Pair<IntWritable, VectorWritable> next; next = iterator.next(); vector = new DenseVector(next.getSecond().get()); } finally { Closeables.close(iterator, false); } return vector; }
From source file: org.qcri.pca.ReconstructionErrJob.java
/**
 * Loads every (index, value) record produced by the job and feeds each one to
 * {@code readIndividualResult}.
 *
 * @param outDirPath directory containing the reducer output file
 * @param conf the Hadoop configuration
 * @throws IOException when the result file cannot be read
 */
public void loadResults(Path outDirPath, Configuration conf) throws IOException {
    Path resultFile = new Path(outDirPath, "part-r-00000");
    SequenceFileIterator<IntWritable, DoubleWritable> reader =
            new SequenceFileIterator<IntWritable, DoubleWritable>(resultFile, true, conf);
    try {
        while (reader.hasNext()) {
            Pair<IntWritable, DoubleWritable> record = reader.next();
            readIndividualResult(record.getFirst().get(), record.getSecond().get());
        }
    } finally {
        Closeables.close(reader, false);
    }
}
From source file: org.qcri.pca.VarianceJob.java
/**
 * Loads the final variance sum produced by the job from HDFS into {@code finalSum}.
 *
 * @param outDirPath directory containing the reducer output
 * @param conf the Hadoop configuration
 * @throws IOException if the result file cannot be read or is empty
 */
public void loadResult(Path outDirPath, Configuration conf) throws IOException {
    Path finalNumberFile = new Path(outDirPath, "part-r-00000");
    SequenceFileIterator<NullWritable, DoubleWritable> iterator =
            new SequenceFileIterator<NullWritable, DoubleWritable>(finalNumberFile, true, conf);
    try {
        // FIX: next() on an empty file would throw an unhelpful NoSuchElementException;
        // fail with the declared IOException instead.
        if (!iterator.hasNext()) {
            throw new IOException("No value found in " + finalNumberFile);
        }
        finalSum = iterator.next().getSecond().get();
    } finally {
        Closeables.close(iterator, false);
    }
}