Example usage for org.apache.hadoop.io IntWritable get

Introduction

On this page you can find example usage of org.apache.hadoop.io.IntWritable.get().

Prototype

public int get() 

Document

Return the value of this IntWritable.
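
For orientation before the project examples, here is a minimal standalone sketch (not taken from any of the sources below; the class name IntWritableGetExample is purely illustrative) showing how get() returns the wrapped primitive and how it pairs with set(int) and the Writable serialization round trip:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;

public class IntWritableGetExample {
    public static void main(String[] args) throws IOException {
        IntWritable count = new IntWritable(42);

        // get() unwraps the boxed value back to a primitive int
        int plain = count.get();

        // set(int) replaces the wrapped value in place
        count.set(7);

        // Writable round trip: write() serializes the int, readFields() restores it,
        // and get() reads the deserialized value
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        count.write(new DataOutputStream(bytes));

        IntWritable copy = new IntWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(plain);      // 42
        System.out.println(copy.get()); // 7
    }
}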

Usage

From source file: org.apache.mahout.utils.eval.InMemoryFactorizationEvaluator.java

License: Apache License

private Matrix readMatrix(Path dir) throws IOException {

    Matrix matrix = new SparseMatrix(new int[] { Integer.MAX_VALUE, Integer.MAX_VALUE });

    FileSystem fs = dir.getFileSystem(getConf());
    for (FileStatus seqFile : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = seqFile.getPath();
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, path, getConf());
            IntWritable key = new IntWritable();
            VectorWritable value = new VectorWritable();
            while (reader.next(key, value)) {
                int row = key.get();
                Iterator<Vector.Element> elementsIterator = value.get().iterateNonZero();
                while (elementsIterator.hasNext()) {
                    Vector.Element element = elementsIterator.next();
                    matrix.set(row, element.index(), element.get());
                }
            }
        } finally {
            IOUtils.quietClose(reader);
        }
    }
    return matrix;
}

From source file: org.apache.mahout.utils.vectors.text.term.TFPartialVectorReducer.java

License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    try {
        dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
        sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
        maxNGramSize = conf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);
        URI[] localFiles = DistributedCache.getCacheFiles(conf);
        if (localFiles == null || localFiles.length < 1) {
            throw new IllegalArgumentException("missing paths from the DistributedCache");
        }
        Path dictionaryFile = new Path(localFiles[0].getPath());
        FileSystem fs = dictionaryFile.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, conf);
        Writable key = new Text();
        IntWritable value = new IntWritable();

        // key is the word, value is its id
        while (reader.next(key, value)) {
            dictionary.put(key.toString(), value.get());
        }
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}

From source file: org.apache.mahout.utils.vectors.tfidf.TFIDFConverter.java

License: Apache License

/**
 * Read the document frequency list which is built at the end of the DF Count Job. This will use constant
 * memory and will run at the speed of your disk read.
 *
 * @param featureCountPath
 * @param dictionaryPathBase
 * @throws IOException
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = new ArrayList<Path>();

    IntWritable key = new IntWritable();
    LongWritable value = new LongWritable();
    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(featureCountPath, OUTPUT_FILES_PATTERN));

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);

    long currentChunkSize = 0;
    long featureCount = 0;
    long vectorCount = Long.MAX_VALUE;
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is the feature id, value is its count
        while (reader.next(key, value)) {
            if (currentChunkSize > chunkSizeLimit) {
                freqWriter.close();
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);

        }
    }
    featureCount++;
    freqWriter.close();
    Long[] counts = { featureCount, vectorCount };
    return new Pair<Long[], List<Path>>(counts, chunkPaths);
}

From source file: org.apache.mahout.utils.vectors.tfidf.TFIDFPartialVectorReducer.java

License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    try {
        Configuration conf = context.getConfiguration();
        URI[] localFiles = DistributedCache.getCacheFiles(conf);
        if (localFiles == null || localFiles.length < 1) {
            throw new IllegalArgumentException("missing paths from the DistributedCache");
        }

        vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
        featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
        minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
        maxDfPercent = conf.getInt(TFIDFConverter.MAX_DF_PERCENTAGE, 99);
        sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);

        Path dictionaryFile = new Path(localFiles[0].getPath());
        FileSystem fs = dictionaryFile.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, conf);
        IntWritable key = new IntWritable();
        LongWritable value = new LongWritable();

        // key is feature, value is the document frequency
        while (reader.next(key, value)) {
            dictionary.put(key.get(), value.get());
        }
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}

From source file: org.apache.mahout.vectorizer.tfidf.TFIDFConverter.java

License: Apache License

/**
 * Read the document frequency list which is built at the end of the DF Count Job. This will use constant
 * memory and will run at the speed of your disk read.
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);

    try {
        long currentChunkSize = 0;
        long featureCount = 0;
        long vectorCount = Long.MAX_VALUE;
        Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
        for (Pair<IntWritable, LongWritable> record : new SequenceFileDirIterable<IntWritable, LongWritable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {

            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(freqWriter, false);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            IntWritable key = record.getFirst();
            LongWritable value = record.getSecond();
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);

        }
        featureCount++;
        Long[] counts = { featureCount, vectorCount };
        return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
        Closeables.close(freqWriter, false);
    }
}

From source file: org.apache.metron.integration.PcapParserIntegrationTest.java

License: Apache License

private static Map<String, byte[]> readPcaps(Path pcapFile) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(new Configuration(), Reader.file(pcapFile));
    Map<String, byte[]> ret = new HashMap<>();
    IntWritable key = new IntWritable();
    BytesWritable value = new BytesWritable();
    PcapParser parser = new PcapParser();
    parser.init();
    while (reader.next(key, value)) {
        int keyInt = key.get();
        byte[] valueBytes = value.copyBytes();
        JSONObject message = parser.parse(valueBytes).get(0);
        if (parser.validate(message)) {
            ret.put(PcapUtils.getSessionKey(message), valueBytes);
        }
    }
    return ret;
}

From source file: org.apache.nutch.indexer.DeleteDuplicates.java

License: Apache License

/** Delete docs named in values from index named in key. */
public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
        throws IOException {
    Path index = new Path(key.toString());
    IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()));
    try {
        while (values.hasNext()) {
            IntWritable value = (IntWritable) values.next();
            LOG.debug("-delete " + index + " doc=" + value);
            reader.deleteDocument(value.get());
        }
    } finally {
        reader.close();
    }
}

From source file: org.apache.nutch.scoring.depth.DepthScoringFilter.java

License: Apache License

@Override
public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException {
    // boost up by current depth
    int curDepth, curMaxDepth;
    IntWritable maxDepth = (IntWritable) datum.getMetaData().get(MAX_DEPTH_KEY_W);
    if (maxDepth != null) {
        curMaxDepth = maxDepth.get();
    } else {
        curMaxDepth = defaultMaxDepth;
    }
    IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W);
    if (depth == null) {
        // penalize
        curDepth = curMaxDepth;
    } else {
        curDepth = depth.get();
    }
    int mul = curMaxDepth - curDepth;
    return initSort * (1 + mul);
}

From source file: org.apache.nutch.scoring.depth.DepthScoringFilter.java

License: Apache License

@Override
public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked)
        throws ScoringFilterException {
    // find a minimum of all depths
    int newDepth = DEFAULT_MAX_DEPTH;
    if (old != null) {
        IntWritable oldDepth = (IntWritable) old.getMetaData().get(DEPTH_KEY_W);
        if (oldDepth != null) {
            newDepth = oldDepth.get();
        } else {
            // not set ?
            initialScore(url, old);
        }
    }
    for (CrawlDatum lnk : inlinked) {
        IntWritable depth = (IntWritable) lnk.getMetaData().get(DEPTH_KEY_W);
        if (depth != null && depth.get() < newDepth) {
            newDepth = depth.get();
        }
    }
    datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(newDepth));
}

From source file: org.apache.nutch.scoring.orphan.OrphanScoringFilter.java

License: Apache License

public void orphanedScore(Text url, CrawlDatum datum) {
    // Already has an orphaned time?
    if (datum.getMetaData().containsKey(ORPHAN_KEY_WRITABLE)) {
        // Get the last time this hyperlink was inlinked
        IntWritable writable = (IntWritable) datum.getMetaData().get(ORPHAN_KEY_WRITABLE);
        int lastInlinkTime = writable.get();
        int now = (int) (System.currentTimeMillis() / 1000);
        int elapsedSinceLastInLinkTime = now - lastInlinkTime;

        if (elapsedSinceLastInLinkTime > markOrphanAfter) {
            // Mark as orphan so we can permanently delete it
            datum.setStatus(CrawlDatum.STATUS_DB_ORPHAN);
        } else if (elapsedSinceLastInLinkTime > markGoneAfter) {
            // Mark as gone so the indexer can remove it
            datum.setStatus(CrawlDatum.STATUS_DB_GONE);
        }
    }
}