List of usage examples for org.apache.hadoop.io LongWritable get
public long get()
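get() returns the primitive long currently wrapped by the LongWritable (the counterpart of set(long)). A minimal, self-contained round-trip sketch — the demo class name is hypothetical and not taken from the examples below; it only assumes hadoop-common on the classpath:

import org.apache.hadoop.io.LongWritable;

// Hypothetical demo (not from the usage examples below): wrap a primitive
// long with set(), then unwrap it again with get().
public class LongWritableGetDemo {
    public static void main(String[] args) {
        LongWritable writable = new LongWritable();
        writable.set(42L);          // store the primitive value
        long raw = writable.get();  // read it back as a plain long
        System.out.println(raw);    // prints 42
    }
}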
From source file:org.apache.mahout.utils.vectors.tfidf.TFIDFConverter.java
License:Apache License
/**
 * Read the document frequency List which is built at the end of the DF Count Job. This will use constant
 * memory and will run at the speed of your disk read
 *
 * @param featureCountPath
 * @param dictionaryPathBase
 * @throws IOException
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = new ArrayList<Path>();

    IntWritable key = new IntWritable();
    LongWritable value = new LongWritable();
    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(featureCountPath, OUTPUT_FILES_PATTERN));

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);

    long currentChunkSize = 0;
    long featureCount = 0;
    long vectorCount = Long.MAX_VALUE;
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is the feature, value is its count
        while (reader.next(key, value)) {
            if (currentChunkSize > chunkSizeLimit) {
                freqWriter.close();
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);
        }
    }
    featureCount++;
    freqWriter.close();
    Long[] counts = { featureCount, vectorCount };
    return new Pair<Long[], List<Path>>(counts, chunkPaths);
}
From source file:org.apache.mahout.utils.vectors.tfidf.TFIDFPartialVectorReducer.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    try {
        Configuration conf = context.getConfiguration();
        URI[] localFiles = DistributedCache.getCacheFiles(conf);
        if (localFiles == null || localFiles.length < 1) {
            throw new IllegalArgumentException("missing paths from the DistributedCache");
        }
        vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
        featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
        minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
        maxDfPercent = conf.getInt(TFIDFConverter.MAX_DF_PERCENTAGE, 99);
        sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);

        Path dictionaryFile = new Path(localFiles[0].getPath());
        FileSystem fs = dictionaryFile.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, conf);
        IntWritable key = new IntWritable();
        LongWritable value = new LongWritable();

        // key is feature, value is the document frequency
        while (reader.next(key, value)) {
            dictionary.put(key.get(), value.get());
        }
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
From source file:org.apache.mahout.vectorizer.term.TermCountCombiner.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
    long sum = 0;
    for (LongWritable value : values) {
        sum += value.get();
    }
    context.write(key, new LongWritable(sum));
}
From source file:org.apache.mahout.vectorizer.tfidf.TFIDFConverter.java
License:Apache License
/**
 * Read the document frequency List which is built at the end of the DF Count Job. This will use constant
 * memory and will run at the speed of your disk read
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);

    try {
        long currentChunkSize = 0;
        long featureCount = 0;
        long vectorCount = Long.MAX_VALUE;
        Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
        for (Pair<IntWritable, LongWritable> record : new SequenceFileDirIterable<IntWritable, LongWritable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {

            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(freqWriter, false);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            IntWritable key = record.getFirst();
            LongWritable value = record.getSecond();
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);
        }
        featureCount++;
        Long[] counts = { featureCount, vectorCount };
        return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
        Closeables.close(freqWriter, false);
    }
}
From source file:org.apache.metron.spout.pcap.PartitionHDFSWriter.java
License:Apache License
public void handle(LongWritable ts, BytesWritable value) throws IOException {
    turnoverIfNecessary(ts.get());
    writer.append(ts, value);
    syncHandler.sync(outputStream);
    numWritten++;
}
From source file:org.apache.metron.utils.PcapInspector.java
License:Apache License
public static void main(String... argv) throws IOException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs();
    CommandLine cli = InspectorOptions.parse(new PosixParser(), otherArgs);
    Path inputPath = new Path(InspectorOptions.INPUT.get(cli));
    int n = -1;
    if (InspectorOptions.NUM.has(cli)) {
        n = Integer.parseInt(InspectorOptions.NUM.get(cli));
    }
    SequenceFile.Reader reader = new SequenceFile.Reader(new Configuration(),
            SequenceFile.Reader.file(inputPath));
    LongWritable key = new LongWritable();
    BytesWritable value = new BytesWritable();

    for (int i = 0; (n < 0 || i < n) && reader.next(key, value); ++i) {
        long millis = Long.divideUnsigned(key.get(), 1000000);
        String ts = DATE_FORMAT.format(new Date(millis));
        for (PacketInfo pi : PcapHelper.toPacketInfo(value.copyBytes())) {
            EnumMap<Constants.Fields, Object> result = PcapHelper.packetToFields(pi);
            List<String> fieldResults = new ArrayList<String>() {
                {
                    add("TS: " + ts);
                }
            };
            for (Constants.Fields field : Constants.Fields.values()) {
                if (result.containsKey(field)) {
                    fieldResults.add(field.getName() + ": " + result.get(field));
                }
            }
            System.out.println(Joiner.on(",").join(fieldResults));
        }
    }
}
From source file:org.apache.nutch.crawl.CrawlDbReader.java
License:Apache License
public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics start: " + crawlDb);
    }

    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);
    job.setBoolean("db.reader.stats.sort", sort);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);

    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // https://issues.apache.org/jira/browse/NUTCH-1029
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    JobClient.runJob(job);

    // reading the result
    FileSystem fileSystem = FileSystem.get(config);
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);

    Text key = new Text();
    LongWritable value = new LongWritable();

    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            LongWritable val = stats.get(k);
            if (val == null) {
                val = new LongWritable();
                if (k.equals("scx"))
                    val.set(Long.MIN_VALUE);
                if (k.equals("scn"))
                    val.set(Long.MAX_VALUE);
                stats.put(k, val);
            }
            if (k.equals("scx")) {
                if (val.get() < value.get())
                    val.set(value.get());
            } else if (k.equals("scn")) {
                if (val.get() > value.get())
                    val.set(value.get());
            } else {
                val.set(val.get() + value.get());
            }
        }
        reader.close();
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("Statistics for CrawlDb: " + crawlDb);
        LongWritable totalCnt = stats.get("T");
        stats.remove("T");
        LOG.info("TOTAL urls:\t" + totalCnt.get());
        for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
            String k = entry.getKey();
            LongWritable val = entry.getValue();
            if (k.equals("scn")) {
                LOG.info("min score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("scx")) {
                LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("sct")) {
                LOG.info("avg score:\t" + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
            } else if (k.startsWith("status")) {
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2)
                    LOG.info(" " + st[2] + " :\t" + val);
                else
                    LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
            } else
                LOG.info(k + ":\t" + val);
        }
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics: done");
    }
}
From source file:org.apache.nutch.crawl.WebTableReader.java
License:Apache License
@Override
public Map<String, Object> run(Map<String, Object> args) throws Exception {
    Path tmpFolder = new Path(getConf().get("mapred.temp.dir", ".") + "stat_tmp" + System.currentTimeMillis());

    numJobs = 1;
    currentJob = new NutchJob(getConf(), "db_stats");

    currentJob.getConfiguration().setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Boolean sort = (Boolean) args.get(Nutch.ARG_SORT);
    if (sort == null)
        sort = Boolean.FALSE;
    currentJob.getConfiguration().setBoolean("db.reader.stats.sort", sort);

    DataStore<String, WebPage> store = StorageUtils.createWebStore(currentJob.getConfiguration(), String.class,
            WebPage.class);
    Query<String, WebPage> query = store.newQuery();

    // remove the __g__dirty field since it is not stored
    String[] fields = Arrays.copyOfRange(WebPage._ALL_FIELDS, 1, WebPage._ALL_FIELDS.length);
    query.setFields(fields);

    GoraMapper.initMapperJob(currentJob, query, store, Text.class, LongWritable.class, WebTableStatMapper.class,
            null, true);

    currentJob.setCombinerClass(WebTableStatCombiner.class);
    currentJob.setReducerClass(WebTableStatReducer.class);

    FileOutputFormat.setOutputPath(currentJob, tmpFolder);

    currentJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    currentJob.setOutputKeyClass(Text.class);
    currentJob.setOutputValueClass(LongWritable.class);
    FileSystem fileSystem = FileSystem.get(getConf());

    try {
        currentJob.waitForCompletion(true);
    } finally {
        ToolUtil.recordJobStatus(null, currentJob, results);
        if (!currentJob.isSuccessful()) {
            fileSystem.delete(tmpFolder, true);
            return results;
        }
    }

    Text key = new Text();
    LongWritable value = new LongWritable();

    SequenceFile.Reader[] readers = org.apache.hadoop.mapred.SequenceFileOutputFormat.getReaders(getConf(),
            tmpFolder);

    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            LongWritable val = stats.get(k);
            if (val == null) {
                val = new LongWritable();
                if (k.equals("scx"))
                    val.set(Long.MIN_VALUE);
                if (k.equals("scn"))
                    val.set(Long.MAX_VALUE);
                stats.put(k, val);
            }
            if (k.equals("scx")) {
                if (val.get() < value.get())
                    val.set(value.get());
            } else if (k.equals("scn")) {
                if (val.get() > value.get())
                    val.set(value.get());
            } else {
                val.set(val.get() + value.get());
            }
        }
        reader.close();
    }

    LongWritable totalCnt = stats.get("T");
    if (totalCnt == null)
        totalCnt = new LongWritable(0);
    stats.remove("T");
    results.put("TOTAL urls", totalCnt.get());
    for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
        String k = entry.getKey();
        LongWritable val = entry.getValue();
        if (k.equals("scn")) {
            results.put("min score", (val.get() / 1000.0f));
        } else if (k.equals("scx")) {
            results.put("max score", (val.get() / 1000.0f));
        } else if (k.equals("sct")) {
            results.put("avg score", (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
        } else if (k.startsWith("status")) {
            String[] st = k.split(" ");
            int code = Integer.parseInt(st[1]);
            if (st.length > 2)
                results.put(st[2], val.get());
            else
                results.put(st[0] + " " + code + " (" + CrawlStatus.getName((byte) code) + ")", val.get());
        } else
            results.put(k, val.get());
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);

    return results;
}
From source file:org.apache.nutch.mapreduce.WebTableReader.java
License:Apache License
@Override
protected void doRun(Map<String, Object> args) throws Exception {
    Path tmpFolder = new Path(getConf().get("mapred.temp.dir", ".") + "stat_tmp" + System.currentTimeMillis());

    currentJob.getConfiguration().setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Boolean sort = (Boolean) args.get(Nutch.ARG_SORT);
    if (sort == null)
        sort = Boolean.FALSE;
    currentJob.getConfiguration().setBoolean("db.reader.stats.sort", sort);

    DataStore<String, WebPage> store = StorageUtils.createWebStore(currentJob.getConfiguration(), String.class,
            WebPage.class);
    Query<String, WebPage> query = store.newQuery();

    // remove the __g__dirty field since it is not stored
    String[] fields = Arrays.copyOfRange(WebPage._ALL_FIELDS, 1, WebPage._ALL_FIELDS.length);
    query.setFields(fields);

    GoraMapper.initMapperJob(currentJob, query, store, Text.class, LongWritable.class, WebTableStatMapper.class,
            null, true);

    currentJob.setCombinerClass(WebTableStatCombiner.class);
    currentJob.setReducerClass(WebTableStatReducer.class);

    FileOutputFormat.setOutputPath(currentJob, tmpFolder);

    currentJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    currentJob.setOutputKeyClass(Text.class);
    currentJob.setOutputValueClass(LongWritable.class);
    FileSystem fileSystem = FileSystem.get(getConf());

    try {
        currentJob.waitForCompletion(true);
    } finally {
        if (!currentJob.isSuccessful()) {
            fileSystem.delete(tmpFolder, true);
            return;
        }
    }

    Text key = new Text();
    LongWritable value = new LongWritable();

    SequenceFile.Reader[] readers = org.apache.hadoop.mapred.SequenceFileOutputFormat.getReaders(getConf(),
            tmpFolder);

    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            LongWritable val = stats.get(k);
            if (val == null) {
                val = new LongWritable();
                if (k.equals("scx"))
                    val.set(Long.MIN_VALUE);
                if (k.equals("scn"))
                    val.set(Long.MAX_VALUE);
                stats.put(k, val);
            }
            if (k.equals("scx")) {
                if (val.get() < value.get())
                    val.set(value.get());
            } else if (k.equals("scn")) {
                if (val.get() > value.get())
                    val.set(value.get());
            } else {
                val.set(val.get() + value.get());
            }
        }
        reader.close();
    }

    LongWritable totalCnt = stats.get("T");
    if (totalCnt == null)
        totalCnt = new LongWritable(0);
    stats.remove("T");
    results.put("TOTAL urls", totalCnt.get());
    for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
        String k = entry.getKey();
        LongWritable val = entry.getValue();
        if (k.equals("scn")) {
            results.put("min score", (val.get() / 1000.0f));
        } else if (k.equals("scx")) {
            results.put("max score", (val.get() / 1000.0f));
        } else if (k.equals("sct")) {
            results.put("avg score", (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
        } else if (k.startsWith("status")) {
            String[] st = k.split(" ");
            int code = Integer.parseInt(st[1]);
            if (st.length > 2)
                results.put(st[2], val.get());
            else
                results.put(st[0] + " " + code + " (" + CrawlStatus.getName((byte) code) + ")", val.get());
        } else
            results.put(k, val.get());
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
}
From source file:org.apache.orc.mapred.TestOrcOutputFormat.java
License:Apache License
/**
 * Test the case where the top level isn't a struct, but a long.
 */
@Test
public void testLongRoot() throws Exception {
    conf.set("mapreduce.task.attempt.id", "attempt_20160101_0001_m_000001_0");
    conf.setOutputCommitter(NullOutputCommitter.class);
    conf.set(OrcConf.COMPRESS.getAttribute(), "SNAPPY");
    conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), 1000);
    conf.setInt(OrcConf.BUFFER_SIZE.getAttribute(), 64 * 1024);
    conf.set(OrcConf.WRITE_FORMAT.getAttribute(), "0.11");
    final String typeStr = "bigint";
    OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, typeStr);
    FileOutputFormat.setOutputPath(conf, workDir);
    TypeDescription type = TypeDescription.fromString(typeStr);
    LongWritable value = new LongWritable();
    NullWritable nada = NullWritable.get();
    RecordWriter<NullWritable, LongWritable> writer = new OrcOutputFormat<LongWritable>().getRecordWriter(fs,
            conf, "long.orc", Reporter.NULL);
    for (long lo = 0; lo < 2000; ++lo) {
        value.set(lo);
        writer.write(nada, value);
    }
    writer.close(Reporter.NULL);

    Path path = new Path(workDir, "long.orc");
    Reader file = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    assertEquals(CompressionKind.SNAPPY, file.getCompressionKind());
    assertEquals(2000, file.getNumberOfRows());
    assertEquals(1000, file.getRowIndexStride());
    assertEquals(64 * 1024, file.getCompressionSize());
    assertEquals(OrcFile.Version.V_0_11, file.getFileVersion());
    FileSplit split = new FileSplit(path, 0, 100000, new String[0]);
    RecordReader<NullWritable, LongWritable> reader = new OrcInputFormat<LongWritable>().getRecordReader(split,
            conf, Reporter.NULL);
    nada = reader.createKey();
    value = reader.createValue();
    for (long lo = 0; lo < 2000; ++lo) {
        assertEquals(true, reader.next(nada, value));
        assertEquals(lo, value.get());
    }
    assertEquals(false, reader.next(nada, value));
}