Example usage for org.apache.mahout.math.jet.random.sampling RandomSampler sample

Introduction

In this page you can find the example usage for org.apache.mahout.math.jet.random.sampling RandomSampler sample.

Prototype

public static void sample(long n, long N, int count, long low, long[] values, int fromIndex,
        Random randomGenerator)

Source Link

Document

Efficiently computes a sorted random set of count elements from the interval [low,low+N-1].

Usage

From source file:com.netease.news.utils.SplitInput.java

License:Apache License

/**
 * Perform a split on the specified input file. Results will be written to files of the same name in the specified
 * training and test output directories. The {@link #validate()} method is called prior to executing the split.
 *///from  w w w  .  jav  a  2  s  .  c o  m
public void splitFile(Path inputFile) throws IOException {
    Configuration conf = getConf();
    FileSystem fs = inputFile.getFileSystem(conf);
    if (fs.getFileStatus(inputFile) == null) {
        throw new IOException(inputFile + " does not exist");
    }
    if (fs.getFileStatus(inputFile).isDir()) {
        throw new IOException(inputFile + " is a directory");
    }

    validate();

    Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
    Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

    int lineCount = countLines(fs, inputFile, charset);

    log.info("{} has {} lines", inputFile.getName(), lineCount);

    int testSplitStart = 0;
    int testSplitSize = this.testSplitSize; // don't modify state
    BitSet randomSel = null;

    if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
        testSplitSize = this.testRandomSelectionSize;

        if (testRandomSelectionPct > 0) {
            testSplitSize = Math.round(lineCount * testRandomSelectionPct / 100.0f);
        }
        log.info("{} test split size is {} based on random selection percentage {}", inputFile.getName(),
                testSplitSize, testRandomSelectionPct);
        long[] ridx = new long[testSplitSize];
        RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
        randomSel = new BitSet(lineCount);
        for (long idx : ridx) {
            randomSel.set((int) idx + 1);
        }
    } else {
        if (testSplitPct > 0) { // calculate split size based on percentage
            testSplitSize = Math.round(lineCount * testSplitPct / 100.0f);
            log.info("{} test split size is {} based on percentage {}", inputFile.getName(), testSplitSize,
                    testSplitPct);
        } else {
            log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
        }

        if (splitLocation > 0) { // calculate start of split based on percentage
            testSplitStart = Math.round(lineCount * splitLocation / 100.0f);
            if (lineCount - testSplitStart < testSplitSize) {
                // adjust split start downwards based on split size.
                testSplitStart = lineCount - testSplitSize;
            }
            log.info("{} test split start is {} based on split location {}", inputFile.getName(),
                    testSplitStart, splitLocation);
        }

        if (testSplitStart < 0) {
            throw new IllegalArgumentException(
                    "test split size for " + inputFile + " is too large, it would produce an "
                            + "empty training set from the initial set of " + lineCount + " examples");
        } else if (lineCount - testSplitSize < testSplitSize) {
            log.warn(
                    "Test set size for {} may be too large, {} is larger than the number of "
                            + "lines remaining in the training set: {}",
                    inputFile, testSplitSize, lineCount - testSplitSize);
        }
    }
    int trainCount = 0;
    int testCount = 0;
    if (!useSequence) {
        BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
        Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
        Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

        try {

            String line;
            int pos = 0;
            while ((line = reader.readLine()) != null) {
                pos++;

                Writer writer;
                if (testRandomSelectionPct > 0) { // Randomly choose
                    writer = randomSel.get(pos) ? testWriter : trainingWriter;
                } else { // Choose based on location
                    writer = pos > testSplitStart ? testWriter : trainingWriter;
                }

                if (writer == testWriter) {
                    if (testCount >= testSplitSize) {
                        writer = trainingWriter;
                    } else {
                        testCount++;
                    }
                }
                if (writer == trainingWriter) {
                    trainCount++;
                }
                writer.write(line);
                writer.write('\n');
            }

        } finally {
            Closeables.close(reader, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    } else {
        SequenceFileIterator<Writable, Writable> iterator = new SequenceFileIterator<Writable, Writable>(
                inputFile, false, fs.getConf());
        SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile,
                iterator.getKeyClass(), iterator.getValueClass());
        SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile,
                iterator.getKeyClass(), iterator.getValueClass());
        try {

            int pos = 0;
            while (iterator.hasNext()) {
                pos++;
                SequenceFile.Writer writer;
                if (testRandomSelectionPct > 0) { // Randomly choose
                    writer = randomSel.get(pos) ? testWriter : trainingWriter;
                } else { // Choose based on location
                    writer = pos > testSplitStart ? testWriter : trainingWriter;
                }

                if (writer == testWriter) {
                    if (testCount >= testSplitSize) {
                        writer = trainingWriter;
                    } else {
                        testCount++;
                    }
                }
                if (writer == trainingWriter) {
                    trainCount++;
                }
                Pair<Writable, Writable> pair = iterator.next();
                writer.append(pair.getFirst(), pair.getSecond());
            }

        } finally {
            Closeables.close(iterator, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    }
    log.info("file: {}, input: {} train: {}, test: {} starting at {}", inputFile.getName(), lineCount,
            trainCount, testCount, testSplitStart);

    // testing;
    if (callback != null) {
        callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
    }
}

From source file:com.tamingtext.util.SplitInput.java

License:Apache License

/** Perform a split on the specified input file. Results will be written to files of the same name in the specified 
 *  training and test output directories. The {@link #validate()} method is called prior to executing the split.
 *//*from   w  ww . j  av a 2 s.c  o m*/
public void splitFile(Path inputFile) throws IOException {
    if (fs.getFileStatus(inputFile) == null) {
        throw new IOException(inputFile + " does not exist");
    } else if (fs.getFileStatus(inputFile).isDir()) {
        throw new IOException(inputFile + " is a directory");
    }

    validate();

    Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
    Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

    int lineCount = countLines(fs, inputFile, charset);

    log.info("{} has {} lines", inputFile.getName(), lineCount);

    int testSplitStart = 0;
    int testSplitSize = this.testSplitSize; // don't modify state
    BitSet randomSel = null;

    if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
        testSplitSize = this.testRandomSelectionSize;

        if (testRandomSelectionPct > 0) {
            testSplitSize = Math.round(lineCount * (testRandomSelectionPct / 100.0f));
        }
        log.info("{} test split size is {} based on random selection percentage {}",
                new Object[] { inputFile.getName(), testSplitSize, testRandomSelectionPct });
        long[] ridx = new long[testSplitSize];
        RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
        randomSel = new BitSet(lineCount);
        for (long idx : ridx) {
            randomSel.set((int) idx + 1);
        }
    } else {
        if (testSplitPct > 0) { // calculate split size based on percentage
            testSplitSize = Math.round(lineCount * (testSplitPct / 100.0f));
            log.info("{} test split size is {} based on percentage {}",
                    new Object[] { inputFile.getName(), testSplitSize, testSplitPct });
        } else {
            log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
        }

        if (splitLocation > 0) { // calculate start of split based on percentage
            testSplitStart = Math.round(lineCount * (splitLocation / 100.0f));
            if (lineCount - testSplitStart < testSplitSize) {
                // adjust split start downwards based on split size.
                testSplitStart = lineCount - testSplitSize;
            }
            log.info("{} test split start is {} based on split location {}",
                    new Object[] { inputFile.getName(), testSplitStart, splitLocation });
        }

        if (testSplitStart < 0) {
            throw new IllegalArgumentException(
                    "test split size for " + inputFile + " is too large, it would produce an "
                            + "empty training set from the initial set of " + lineCount + " examples");
        } else if ((lineCount - testSplitSize) < testSplitSize) {
            log.warn(
                    "Test set size for {} may be too large, {} is larger than the number of "
                            + "lines remaining in the training set: {}",
                    new Object[] { inputFile, testSplitSize, lineCount - testSplitSize });
        }
    }

    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
    Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
    Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

    int pos = 0;
    int trainCount = 0;
    int testCount = 0;

    String line;
    while ((line = reader.readLine()) != null) {
        pos++;

        Writer writer;
        if (testRandomSelectionPct > 0) { // Randomly choose
            writer = randomSel.get(pos) ? testWriter : trainingWriter;
        } else { // Choose based on location
            writer = pos > testSplitStart ? testWriter : trainingWriter;
        }

        if (writer == testWriter) {
            if (testCount >= testSplitSize) {
                writer = trainingWriter;
            } else {
                testCount++;
            }
        }

        if (writer == trainingWriter) {
            trainCount++;
        }

        writer.write(line);
        writer.write('\n');
    }

    IOUtils.close(Collections.singleton(trainingWriter));
    IOUtils.close(Collections.singleton(testWriter));

    log.info("file: {}, input: {} train: {}, test: {} starting at {}",
            new Object[] { inputFile.getName(), lineCount, trainCount, testCount, testSplitStart });

    // testing;
    if (callback != null) {
        callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
    }
}