List of usage examples for org.apache.hadoop.io.IntWritable.get()
public int get()
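Returns the int value this IntWritable wraps. Before the real-world examples below, here is a minimal standalone sketch (not taken from any of the source files that follow) showing the set()/get() round trip; the class name IntWritableGetDemo is illustrative only:

// Minimal sketch: IntWritable wraps a single int for Hadoop serialization;
// get() returns the wrapped value.
import org.apache.hadoop.io.IntWritable;

public class IntWritableGetDemo {
    public static void main(String[] args) {
        IntWritable w = new IntWritable(42);   // wrap an int
        int v = w.get();                       // unwrap it again
        System.out.println(v);                 // 42
        w.set(7);                              // writables are mutable and reusable
        System.out.println(w.get());           // 7
    }
}

In the examples below, get() is typically called on a reusable key or value object while iterating a SequenceFile or reading Writable metadata.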
From source file:org.apache.mahout.utils.eval.InMemoryFactorizationEvaluator.java
License:Apache License
private Matrix readMatrix(Path dir) throws IOException {
    Matrix matrix = new SparseMatrix(new int[] { Integer.MAX_VALUE, Integer.MAX_VALUE });
    FileSystem fs = dir.getFileSystem(getConf());
    for (FileStatus seqFile : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = seqFile.getPath();
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, path, getConf());
            IntWritable key = new IntWritable();
            VectorWritable value = new VectorWritable();
            while (reader.next(key, value)) {
                int row = key.get();
                Iterator<Vector.Element> elementsIterator = value.get().iterateNonZero();
                while (elementsIterator.hasNext()) {
                    Vector.Element element = elementsIterator.next();
                    matrix.set(row, element.index(), element.get());
                }
            }
        } finally {
            IOUtils.quietClose(reader);
        }
    }
    return matrix;
}
From source file:org.apache.mahout.utils.vectors.text.term.TFPartialVectorReducer.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    try {
        dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
        sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
        maxNGramSize = conf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);
        URI[] localFiles = DistributedCache.getCacheFiles(conf);
        if (localFiles == null || localFiles.length < 1) {
            throw new IllegalArgumentException("missing paths from the DistributedCache");
        }
        Path dictionaryFile = new Path(localFiles[0].getPath());
        FileSystem fs = dictionaryFile.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, conf);
        Writable key = new Text();
        IntWritable value = new IntWritable();
        // key is word, value is id
        while (reader.next(key, value)) {
            dictionary.put(key.toString(), value.get());
        }
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
From source file:org.apache.mahout.utils.vectors.tfidf.TFIDFConverter.java
License:Apache License
/**
 * Read the document frequency List which is built at the end of the DF Count Job. This will use
 * constant memory and will run at the speed of your disk read.
 *
 * @param featureCountPath
 * @param dictionaryPathBase
 * @throws IOException
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = new ArrayList<Path>();
    IntWritable key = new IntWritable();
    LongWritable value = new LongWritable();
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(featureCountPath, OUTPUT_FILES_PATTERN));
    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);
    long currentChunkSize = 0;
    long featureCount = 0;
    long vectorCount = Long.MAX_VALUE;
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is feature, value is count
        while (reader.next(key, value)) {
            if (currentChunkSize > chunkSizeLimit) {
                freqWriter.close();
                chunkIndex++;
                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);
                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }
            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);
        }
    }
    featureCount++;
    freqWriter.close();
    Long[] counts = { featureCount, vectorCount };
    return new Pair<Long[], List<Path>>(counts, chunkPaths);
}
From source file:org.apache.mahout.utils.vectors.tfidf.TFIDFPartialVectorReducer.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    try {
        Configuration conf = context.getConfiguration();
        URI[] localFiles = DistributedCache.getCacheFiles(conf);
        if (localFiles == null || localFiles.length < 1) {
            throw new IllegalArgumentException("missing paths from the DistributedCache");
        }
        vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
        featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
        minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
        maxDfPercent = conf.getInt(TFIDFConverter.MAX_DF_PERCENTAGE, 99);
        sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
        Path dictionaryFile = new Path(localFiles[0].getPath());
        FileSystem fs = dictionaryFile.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, conf);
        IntWritable key = new IntWritable();
        LongWritable value = new LongWritable();
        // key is feature, value is the document frequency
        while (reader.next(key, value)) {
            dictionary.put(key.get(), value.get());
        }
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
From source file:org.apache.mahout.vectorizer.tfidf.TFIDFConverter.java
License:Apache License
/**
 * Read the document frequency List which is built at the end of the DF Count Job. This will use
 * constant memory and will run at the speed of your disk read.
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);
    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);
    try {
        long currentChunkSize = 0;
        long featureCount = 0;
        long vectorCount = Long.MAX_VALUE;
        Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
        for (Pair<IntWritable, LongWritable> record : new SequenceFileDirIterable<IntWritable, LongWritable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(freqWriter, false);
                chunkIndex++;
                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);
                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }
            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            IntWritable key = record.getFirst();
            LongWritable value = record.getSecond();
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);
        }
        featureCount++;
        Long[] counts = { featureCount, vectorCount };
        return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
        Closeables.close(freqWriter, false);
    }
}
From source file:org.apache.metron.integration.PcapParserIntegrationTest.java
License:Apache License
private static Map<String, byte[]> readPcaps(Path pcapFile) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(new Configuration(), Reader.file(pcapFile));
    Map<String, byte[]> ret = new HashMap<>();
    IntWritable key = new IntWritable();
    BytesWritable value = new BytesWritable();
    PcapParser parser = new PcapParser();
    parser.init();
    while (reader.next(key, value)) {
        int keyInt = key.get();
        byte[] valueBytes = value.copyBytes();
        JSONObject message = parser.parse(valueBytes).get(0);
        if (parser.validate(message)) {
            ret.put(PcapUtils.getSessionKey(message), valueBytes);
        }
    }
    return ret;
}
From source file:org.apache.nutch.indexer.DeleteDuplicates.java
License:Apache License
/** Delete docs named in values from index named in key. */
public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
        throws IOException {
    Path index = new Path(key.toString());
    IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()));
    try {
        while (values.hasNext()) {
            IntWritable value = (IntWritable) values.next();
            LOG.debug("-delete " + index + " doc=" + value);
            reader.deleteDocument(value.get());
        }
    } finally {
        reader.close();
    }
}
From source file:org.apache.nutch.scoring.depth.DepthScoringFilter.java
License:Apache License
@Override
public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException {
    // boost up by current depth
    int curDepth, curMaxDepth;
    IntWritable maxDepth = (IntWritable) datum.getMetaData().get(MAX_DEPTH_KEY_W);
    if (maxDepth != null) {
        curMaxDepth = maxDepth.get();
    } else {
        curMaxDepth = defaultMaxDepth;
    }
    IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W);
    if (depth == null) {
        // penalize
        curDepth = curMaxDepth;
    } else {
        curDepth = depth.get();
    }
    int mul = curMaxDepth - curDepth;
    return initSort * (1 + mul);
}
From source file:org.apache.nutch.scoring.depth.DepthScoringFilter.java
License:Apache License
@Override
public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked)
        throws ScoringFilterException {
    // find a minimum of all depths
    int newDepth = DEFAULT_MAX_DEPTH;
    if (old != null) {
        IntWritable oldDepth = (IntWritable) old.getMetaData().get(DEPTH_KEY_W);
        if (oldDepth != null) {
            newDepth = oldDepth.get();
        } else {
            // not set?
            initialScore(url, old);
        }
    }
    for (CrawlDatum lnk : inlinked) {
        IntWritable depth = (IntWritable) lnk.getMetaData().get(DEPTH_KEY_W);
        if (depth != null && depth.get() < newDepth) {
            newDepth = depth.get();
        }
    }
    datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(newDepth));
}
From source file:org.apache.nutch.scoring.orphan.OrphanScoringFilter.java
License:Apache License
public void orphanedScore(Text url, CrawlDatum datum) {
    // Already has an orphaned time?
    if (datum.getMetaData().containsKey(ORPHAN_KEY_WRITABLE)) {
        // Get the last time this hyperlink was inlinked
        IntWritable writable = (IntWritable) datum.getMetaData().get(ORPHAN_KEY_WRITABLE);
        int lastInlinkTime = writable.get();
        int now = (int) (System.currentTimeMillis() / 1000);
        int elapsedSinceLastInLinkTime = now - lastInlinkTime;
        if (elapsedSinceLastInLinkTime > markOrphanAfter) {
            // Mark as orphan so we can permanently delete it
            datum.setStatus(CrawlDatum.STATUS_DB_ORPHAN);
        } else if (elapsedSinceLastInLinkTime > markGoneAfter) {
            // Mark as gone so the indexer can remove it
            datum.setStatus(CrawlDatum.STATUS_DB_GONE);
        }
    }
}