Example usage for org.apache.hadoop.io LongWritable get

List of usage examples for org.apache.hadoop.io LongWritable get

Introduction

On this page you can find example usages of org.apache.hadoop.io.LongWritable.get().

Prototype

public long get() 

Document

Return the value of this LongWritable.
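
Before the full examples, here is a minimal, self-contained sketch of get() paired with set(). It is not taken from the sources below; the class name LongWritableGetExample is chosen purely for illustration.

import org.apache.hadoop.io.LongWritable;

public class LongWritableGetExample {
    public static void main(String[] args) {
        LongWritable counter = new LongWritable(0);
        // get() unwraps the stored primitive long; set() stores a new one
        counter.set(counter.get() + 42L);
        long plain = counter.get();
        System.out.println(plain); // prints 42
    }
}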

Usage

From source file: org.apache.mahout.utils.vectors.tfidf.TFIDFConverter.java

License: Apache License

/**
 * Read the document frequency list which is built at the end of the DF Count Job. This will use constant
 * memory and will run at the speed of your disk read.
 *
 * @param featureCountPath path to the output of the DF count job
 * @param dictionaryPathBase base path under which the frequency chunk files are written
 * @param chunkSizeInMegabytes maximum size of a single frequency chunk, in megabytes
 * @throws IOException if the chunks cannot be read or written
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = new ArrayList<Path>();

    IntWritable key = new IntWritable();
    LongWritable value = new LongWritable();
    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(featureCountPath, OUTPUT_FILES_PATTERN));

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);

    long currentChunkSize = 0;
    long featureCount = 0;
    long vectorCount = Long.MAX_VALUE;
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is the feature id, value is its count
        while (reader.next(key, value)) {
            if (currentChunkSize > chunkSizeLimit) {
                freqWriter.close();
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);

        }
    }
    featureCount++;
    freqWriter.close();
    Long[] counts = { featureCount, vectorCount };
    return new Pair<Long[], List<Path>>(counts, chunkPaths);
}

From source file: org.apache.mahout.utils.vectors.tfidf.TFIDFPartialVectorReducer.java

License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    try {
        Configuration conf = context.getConfiguration();
        URI[] localFiles = DistributedCache.getCacheFiles(conf);
        if (localFiles == null || localFiles.length < 1) {
            throw new IllegalArgumentException("missing paths from the DistributedCache");
        }

        vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
        featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
        minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
        maxDfPercent = conf.getInt(TFIDFConverter.MAX_DF_PERCENTAGE, 99);
        sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);

        Path dictionaryFile = new Path(localFiles[0].getPath());
        FileSystem fs = dictionaryFile.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, conf);
        IntWritable key = new IntWritable();
        LongWritable value = new LongWritable();

        // key is feature, value is the document frequency
        while (reader.next(key, value)) {
            dictionary.put(key.get(), value.get());
        }
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}

From source file: org.apache.mahout.vectorizer.term.TermCountCombiner.java

License: Apache License

@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
    long sum = 0;
    for (LongWritable value : values) {
        sum += value.get();
    }
    context.write(key, new LongWritable(sum));
}

From source file: org.apache.mahout.vectorizer.tfidf.TFIDFConverter.java

License: Apache License

/**
 * Read the document frequency list which is built at the end of the DF Count Job. This will use constant
 * memory and will run at the speed of your disk read.
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);

    try {
        long currentChunkSize = 0;
        long featureCount = 0;
        long vectorCount = Long.MAX_VALUE;
        Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
        for (Pair<IntWritable, LongWritable> record : new SequenceFileDirIterable<IntWritable, LongWritable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {

            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(freqWriter, false);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            IntWritable key = record.getFirst();
            LongWritable value = record.getSecond();
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);

        }
        featureCount++;
        Long[] counts = { featureCount, vectorCount };
        return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
        Closeables.close(freqWriter, false);
    }
}

From source file: org.apache.metron.spout.pcap.PartitionHDFSWriter.java

License: Apache License

public void handle(LongWritable ts, BytesWritable value) throws IOException {
    turnoverIfNecessary(ts.get());
    writer.append(ts, value);
    syncHandler.sync(outputStream);
    numWritten++;
}

From source file: org.apache.metron.utils.PcapInspector.java

License: Apache License

public static void main(String... argv) throws IOException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs();
    CommandLine cli = InspectorOptions.parse(new PosixParser(), otherArgs);
    Path inputPath = new Path(InspectorOptions.INPUT.get(cli));
    int n = -1;
    if (InspectorOptions.NUM.has(cli)) {
        n = Integer.parseInt(InspectorOptions.NUM.get(cli));
    }
    SequenceFile.Reader reader = new SequenceFile.Reader(new Configuration(),
            SequenceFile.Reader.file(inputPath));
    LongWritable key = new LongWritable();
    BytesWritable value = new BytesWritable();

    for (int i = 0; (n < 0 || i < n) && reader.next(key, value); ++i) {
        long millis = Long.divideUnsigned(key.get(), 1000000);
        String ts = DATE_FORMAT.format(new Date(millis));
        for (PacketInfo pi : PcapHelper.toPacketInfo(value.copyBytes())) {
            EnumMap<Constants.Fields, Object> result = PcapHelper.packetToFields(pi);
            List<String> fieldResults = new ArrayList<String>() {
                {
                    add("TS: " + ts);
                }
            };
            for (Constants.Fields field : Constants.Fields.values()) {
                if (result.containsKey(field)) {
                    fieldResults.add(field.getName() + ": " + result.get(field));
                }
            }
            System.out.println(Joiner.on(",").join(fieldResults));
        }
    }
}

From source file: org.apache.nutch.crawl.CrawlDbReader.java

License: Apache License

public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics start: " + crawlDb);
    }

    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);
    job.setBoolean("db.reader.stats.sort", sort);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);

    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // https://issues.apache.org/jira/browse/NUTCH-1029
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    JobClient.runJob(job);

    // reading the result
    FileSystem fileSystem = FileSystem.get(config);
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);

    Text key = new Text();
    LongWritable value = new LongWritable();

    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            LongWritable val = stats.get(k);
            if (val == null) {
                val = new LongWritable();
                if (k.equals("scx"))
                    val.set(Long.MIN_VALUE);
                if (k.equals("scn"))
                    val.set(Long.MAX_VALUE);
                stats.put(k, val);
            }
            if (k.equals("scx")) {
                if (val.get() < value.get())
                    val.set(value.get());
            } else if (k.equals("scn")) {
                if (val.get() > value.get())
                    val.set(value.get());
            } else {
                val.set(val.get() + value.get());
            }
        }
        reader.close();
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("Statistics for CrawlDb: " + crawlDb);
        LongWritable totalCnt = stats.get("T");
        stats.remove("T");
        LOG.info("TOTAL urls:\t" + totalCnt.get());
        for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
            String k = entry.getKey();
            LongWritable val = entry.getValue();
            if (k.equals("scn")) {
                LOG.info("min score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("scx")) {
                LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("sct")) {
                LOG.info("avg score:\t" + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
            } else if (k.startsWith("status")) {
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2)
                    LOG.info("   " + st[2] + " :\t" + val);
                else
                    LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
            } else
                LOG.info(k + ":\t" + val);
        }
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics: done");
    }

}

From source file: org.apache.nutch.crawl.WebTableReader.java

License: Apache License

@Override
public Map<String, Object> run(Map<String, Object> args) throws Exception {
    Path tmpFolder = new Path(getConf().get("mapred.temp.dir", ".") + "stat_tmp" + System.currentTimeMillis());

    numJobs = 1;
    currentJob = new NutchJob(getConf(), "db_stats");

    currentJob.getConfiguration().setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Boolean sort = (Boolean) args.get(Nutch.ARG_SORT);
    if (sort == null)
        sort = Boolean.FALSE;
    currentJob.getConfiguration().setBoolean("db.reader.stats.sort", sort);

    DataStore<String, WebPage> store = StorageUtils.createWebStore(currentJob.getConfiguration(), String.class,
            WebPage.class);
    Query<String, WebPage> query = store.newQuery();

    // remove the __g__dirty field since it is not stored
    String[] fields = Arrays.copyOfRange(WebPage._ALL_FIELDS, 1, WebPage._ALL_FIELDS.length);
    query.setFields(fields);

    GoraMapper.initMapperJob(currentJob, query, store, Text.class, LongWritable.class, WebTableStatMapper.class,
            null, true);

    currentJob.setCombinerClass(WebTableStatCombiner.class);
    currentJob.setReducerClass(WebTableStatReducer.class);

    FileOutputFormat.setOutputPath(currentJob, tmpFolder);

    currentJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    currentJob.setOutputKeyClass(Text.class);
    currentJob.setOutputValueClass(LongWritable.class);
    FileSystem fileSystem = FileSystem.get(getConf());

    try {
        currentJob.waitForCompletion(true);
    } finally {
        ToolUtil.recordJobStatus(null, currentJob, results);
        if (!currentJob.isSuccessful()) {
            fileSystem.delete(tmpFolder, true);
            return results;
        }
    }

    Text key = new Text();
    LongWritable value = new LongWritable();

    SequenceFile.Reader[] readers = org.apache.hadoop.mapred.SequenceFileOutputFormat.getReaders(getConf(),
            tmpFolder);

    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            LongWritable val = stats.get(k);
            if (val == null) {
                val = new LongWritable();
                if (k.equals("scx"))
                    val.set(Long.MIN_VALUE);
                if (k.equals("scn"))
                    val.set(Long.MAX_VALUE);
                stats.put(k, val);
            }
            if (k.equals("scx")) {
                if (val.get() < value.get())
                    val.set(value.get());
            } else if (k.equals("scn")) {
                if (val.get() > value.get())
                    val.set(value.get());
            } else {
                val.set(val.get() + value.get());
            }
        }
        reader.close();
    }

    LongWritable totalCnt = stats.get("T");
    if (totalCnt == null)
        totalCnt = new LongWritable(0);
    stats.remove("T");
    results.put("TOTAL urls", totalCnt.get());
    for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
        String k = entry.getKey();
        LongWritable val = entry.getValue();
        if (k.equals("scn")) {
            results.put("min score", (val.get() / 1000.0f));
        } else if (k.equals("scx")) {
            results.put("max score", (val.get() / 1000.0f));
        } else if (k.equals("sct")) {
            results.put("avg score", (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
        } else if (k.startsWith("status")) {
            String[] st = k.split(" ");
            int code = Integer.parseInt(st[1]);
            if (st.length > 2)
                results.put(st[2], val.get());
            else
                results.put(st[0] + " " + code + " (" + CrawlStatus.getName((byte) code) + ")", val.get());
        } else
            results.put(k, val.get());
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);

    return results;
}

From source file: org.apache.nutch.mapreduce.WebTableReader.java

License: Apache License

@Override
protected void doRun(Map<String, Object> args) throws Exception {
    Path tmpFolder = new Path(getConf().get("mapred.temp.dir", ".") + "stat_tmp" + System.currentTimeMillis());

    currentJob.getConfiguration().setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Boolean sort = (Boolean) args.get(Nutch.ARG_SORT);
    if (sort == null)
        sort = Boolean.FALSE;
    currentJob.getConfiguration().setBoolean("db.reader.stats.sort", sort);

    DataStore<String, WebPage> store = StorageUtils.createWebStore(currentJob.getConfiguration(), String.class,
            WebPage.class);
    Query<String, WebPage> query = store.newQuery();

    // remove the __g__dirty field since it is not stored
    String[] fields = Arrays.copyOfRange(WebPage._ALL_FIELDS, 1, WebPage._ALL_FIELDS.length);
    query.setFields(fields);

    GoraMapper.initMapperJob(currentJob, query, store, Text.class, LongWritable.class, WebTableStatMapper.class,
            null, true);

    currentJob.setCombinerClass(WebTableStatCombiner.class);
    currentJob.setReducerClass(WebTableStatReducer.class);

    FileOutputFormat.setOutputPath(currentJob, tmpFolder);

    currentJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    currentJob.setOutputKeyClass(Text.class);
    currentJob.setOutputValueClass(LongWritable.class);
    FileSystem fileSystem = FileSystem.get(getConf());

    try {
        currentJob.waitForCompletion(true);
    } finally {
        if (!currentJob.isSuccessful()) {
            fileSystem.delete(tmpFolder, true);
            return;
        }
    }

    Text key = new Text();
    LongWritable value = new LongWritable();

    SequenceFile.Reader[] readers = org.apache.hadoop.mapred.SequenceFileOutputFormat.getReaders(getConf(),
            tmpFolder);

    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            LongWritable val = stats.get(k);
            if (val == null) {
                val = new LongWritable();
                if (k.equals("scx"))
                    val.set(Long.MIN_VALUE);
                if (k.equals("scn"))
                    val.set(Long.MAX_VALUE);
                stats.put(k, val);
            }
            if (k.equals("scx")) {
                if (val.get() < value.get())
                    val.set(value.get());
            } else if (k.equals("scn")) {
                if (val.get() > value.get())
                    val.set(value.get());
            } else {
                val.set(val.get() + value.get());
            }
        }
        reader.close();
    }

    LongWritable totalCnt = stats.get("T");
    if (totalCnt == null)
        totalCnt = new LongWritable(0);
    stats.remove("T");
    results.put("TOTAL urls", totalCnt.get());
    for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
        String k = entry.getKey();
        LongWritable val = entry.getValue();
        if (k.equals("scn")) {
            results.put("min score", (val.get() / 1000.0f));
        } else if (k.equals("scx")) {
            results.put("max score", (val.get() / 1000.0f));
        } else if (k.equals("sct")) {
            results.put("avg score", (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
        } else if (k.startsWith("status")) {
            String[] st = k.split(" ");
            int code = Integer.parseInt(st[1]);
            if (st.length > 2)
                results.put(st[2], val.get());
            else
                results.put(st[0] + " " + code + " (" + CrawlStatus.getName((byte) code) + ")", val.get());
        } else
            results.put(k, val.get());
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
}

From source file: org.apache.orc.mapred.TestOrcOutputFormat.java

License: Apache License

/**
 * Test the case where the top level isn't a struct, but a long.
 */
@Test
public void testLongRoot() throws Exception {
    conf.set("mapreduce.task.attempt.id", "attempt_20160101_0001_m_000001_0");
    conf.setOutputCommitter(NullOutputCommitter.class);
    conf.set(OrcConf.COMPRESS.getAttribute(), "SNAPPY");
    conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), 1000);
    conf.setInt(OrcConf.BUFFER_SIZE.getAttribute(), 64 * 1024);
    conf.set(OrcConf.WRITE_FORMAT.getAttribute(), "0.11");
    final String typeStr = "bigint";
    OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, typeStr);
    FileOutputFormat.setOutputPath(conf, workDir);
    TypeDescription type = TypeDescription.fromString(typeStr);
    LongWritable value = new LongWritable();
    NullWritable nada = NullWritable.get();
    RecordWriter<NullWritable, LongWritable> writer = new OrcOutputFormat<LongWritable>().getRecordWriter(fs,
            conf, "long.orc", Reporter.NULL);
    for (long lo = 0; lo < 2000; ++lo) {
        value.set(lo);
        writer.write(nada, value);
    }
    writer.close(Reporter.NULL);

    Path path = new Path(workDir, "long.orc");
    Reader file = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    assertEquals(CompressionKind.SNAPPY, file.getCompressionKind());
    assertEquals(2000, file.getNumberOfRows());
    assertEquals(1000, file.getRowIndexStride());
    assertEquals(64 * 1024, file.getCompressionSize());
    assertEquals(OrcFile.Version.V_0_11, file.getFileVersion());
    FileSplit split = new FileSplit(path, 0, 100000, new String[0]);
    RecordReader<NullWritable, LongWritable> reader = new OrcInputFormat<LongWritable>().getRecordReader(split,
            conf, Reporter.NULL);
    nada = reader.createKey();
    value = reader.createValue();
    for (long lo = 0; lo < 2000; ++lo) {
        assertEquals(true, reader.next(nada, value));
        assertEquals(lo, value.get());
    }
    assertEquals(false, reader.next(nada, value));
}