List of usage examples for the org.apache.hadoop.io.IntWritable no-argument constructor IntWritable()
public IntWritable()
From source file:edu.indiana.d2i.htrc.util.Utilities.java
License:Apache License
/**
 * Converts a plain-text dictionary into a Hadoop {@code SequenceFile}.
 *
 * <p>Each line of {@code input} becomes one record: the key is the line text and the value is the
 * zero-based position of that line in the input.
 *
 * @param input path of the text file to read, one entry per line
 * @param output path of the sequence file to create, resolved against the file system returned by
 *     {@code FileSystem.get(new Configuration())}
 * @throws IOException if the input cannot be read or the sequence file cannot be written
 */
public static void Dictionary2SeqFile(String input, String output) throws IOException {
    // NOTE(review): FileReader decodes with the platform default charset; kept unchanged so
    // existing dictionaries are read exactly as before.
    BufferedReader reader = new BufferedReader(new FileReader(input));
    try {
        Configuration conf = new Configuration();
        SequenceFile.Writer writer = new SequenceFile.Writer(FileSystem.get(conf), conf, new Path(output),
                Text.class, IntWritable.class);
        try {
            // Key and value objects are reused across records; append() serializes them immediately.
            Text key = new Text();
            IntWritable value = new IntWritable();
            int count = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                key.set(line);
                value.set(count++);
                writer.append(key, value);
            }
        } finally {
            // Close even when reading or appending fails so the output stream is not leaked.
            writer.close();
        }
    } finally {
        reader.close();
    }
}
From source file:edu.umd.cloud9.collection.clue.ClueWarcForwardIndex.java
License:Apache License
@Override public ClueWarcRecord getDocument(int docno) { long start = System.currentTimeMillis(); // Trap invalid docnos. if (docno < getFirstDocno() || docno > getLastDocno()) { return null; }//from w ww. ja v a 2 s . co m int idx = Arrays.binarySearch(docnos, docno); if (idx < 0) { idx = -idx - 2; } DecimalFormat df = new DecimalFormat("00000"); String file = collectionPath + "/part-" + df.format(fileno[idx]); LOG.info("fetching docno " + docno + ": seeking to " + offsets[idx] + " at " + file); try { SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(file))); IntWritable key = new IntWritable(); ClueWarcRecord value = new ClueWarcRecord(); reader.seek(offsets[idx]); while (reader.next(key)) { if (key.get() == docno) { break; } } reader.getCurrentValue(value); reader.close(); long duration = System.currentTimeMillis() - start; LOG.info(" docno " + docno + " fetched in " + duration + "ms"); return value; } catch (IOException e) { e.printStackTrace(); } return null; }
From source file:edu.umd.cloud9.collection.clue.ClueWarcForwardIndex.java
License:Apache License
@Override public int getLastDocno() { if (lastDocno != -1) { return lastDocno; }// w w w . j a v a 2 s . c o m // Find the last entry, and then see all the way to the end of the collection. int idx = docnos.length - 1; String file = collectionPath + "/part-" + FORMAT5.format(fileno[idx]); try { SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(file))); IntWritable key = new IntWritable(); reader.seek(offsets[idx]); while (reader.next(key)) ; lastDocno = key.get(); reader.close(); } catch (IOException e) { e.printStackTrace(); } return lastDocno; }
From source file:edu.umd.cloud9.collection.clue.ScanBlockCompressedSequenceFile.java
License:Apache License
public static void main(String[] args) throws IOException { if (args.length != 1) { System.out.println("usage: [SequenceFile]"); System.exit(-1);//from w ww.ja v a 2s . c o m } List<Long> seekPoints = Lists.newArrayList(); long pos = -1; long prevPos = -1; int prevDocno = 0; Path path = new Path(args[0]); Configuration config = new Configuration(); SequenceFile.Reader reader = new SequenceFile.Reader(config, SequenceFile.Reader.file(path)); IntWritable key = new IntWritable(); ClueWarcRecord value = new ClueWarcRecord(); pos = reader.getPosition(); int cnt = 0; while (reader.next(key, value)) { if (prevPos != -1 && prevPos != pos) { System.out.println("## beginning of block at " + prevPos + ", docno:" + prevDocno); seekPoints.add(prevPos); } System.out.println("offset:" + pos + "\tdocno:" + key + "\tdocid:" + value.getDocid()); prevPos = pos; pos = reader.getPosition(); prevDocno = key.get(); cnt++; if (cnt > Integer.MAX_VALUE) break; } reader.close(); reader = new SequenceFile.Reader(config, SequenceFile.Reader.file(path)); for (long p : seekPoints) { reader.seek(p); reader.next(key, value); System.out.println("seeking to pos " + p + "\tdocno:" + key + "\tdocid:" + value.getDocid()); } reader.close(); }
From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java
License:Apache License
private void task3(String inputPath, String redirectPath, String outputPath) throws IOException { // caches/*from w ww . ja va2 s .c om*/ IntWritable mapKey = new IntWritable(); HMapSIW mapVal = new HMapSIW(); HMapSIW tmpMap = new HMapSIW(); IntWritable target = new IntWritable(0); // read the redirect file MapFile.Reader redirectReader = null; MapFile.Writer mapWriter = null; MapFile.Reader mapReader = null; try { mapReader = new MapFile.Reader(new Path(inputPath + "/part-r-00000"), getConf()); redirectReader = new MapFile.Reader(new Path(redirectPath), getConf()); // TODO: Change code here mapWriter = new MapFile.Writer(getConf(), new Path(outputPath), MapFile.Writer.keyClass(IntWritable.class), MapFile.Writer.valueClass(HMapSIW.class)); while (mapReader.next(mapKey, mapVal)) { redirectReader.get(mapKey, target); if (target.get() > 0) { mapReader.get(target, tmpMap); if (!tmpMap.isEmpty()) { tmpMap.putAll(mapVal); mapWriter.append(target, tmpMap); } } else { mapWriter.append(mapKey, mapVal); } } } finally { if (mapWriter != null) mapWriter.close(); if (mapReader != null) mapReader.close(); if (redirectReader != null) redirectReader.close(); // Clean up intermediate data. FileSystem.get(getConf()).delete(new Path(inputPath), true); } }
From source file:edu.umd.cloud9.pagerank.RunPageRankSchimmy.java
License:Apache License
/**
 * Configures and runs phase 1 of one Schimmy PageRank iteration and returns the mass read back
 * from the job's mass output.
 *
 * <p>Input is read from {@code path + "/iter" + sFormat.format(i)}; output goes to
 * {@code path + "/iter" + sFormat.format(j) + "t"}, with per-task mass files under the same name
 * suffixed by {@code "-mass"}. Before the job is submitted, every {@code part-} file of the input
 * is opened and its first record is run through the partitioner, so that the job can be told which
 * file holds which partition (passed as the {@code "PartitionMapping"} property).
 *
 * @param path base directory of the iteration data
 * @param i index of the input iteration
 * @param j index of the output iteration
 * @param n node count, passed to the job as {@code "NodeCount"}
 * @param useCombiner whether to install {@code CombineClass} as combiner
 * @param useInmapCombiner whether to use {@code MapWithInMapperCombiningClass} instead of
 *     {@code MapClass}
 * @param useRange whether to partition with {@code RangePartitioner} instead of hash partitioning
 * @return the result of folding every float in the mass output with {@code sumLogProbs}, starting
 *     from {@code Float.NEGATIVE_INFINITY}
 * @throws IOException if the file system cannot be accessed or the job fails
 */
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);

    String in = path + "/iter" + sFormat.format(i);
    String out = path + "/iter" + sFormat.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(in));

    // we need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log)
    int numPartitions = 0;
    for (FileStatus s : status) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    // Must be set before the partitioner is configured from this conf.
    conf.setInt("NodeCount", n);

    Partitioner p = null;
    if (useRange) {
        p = new RangePartitioner<IntWritable, Writable>();
        p.configure(conf);
    } else {
        p = new HashPartitioner<WritableComparable, Writable>();
    }

    // this is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();

    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        // Use the same "part-" filter as the count above, so that marker files and log
        // directories are never opened as SequenceFiles.
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        int np;
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);
        try {
            // NOTE(review): the result of next() is not checked; an empty part file would reuse
            // the key read from the previous file.
            reader.next(key, value);
            np = p.getPartition(key, value, numPartitions);
        } finally {
            reader.close();
        }

        sLogger.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + "\t");
    }

    String partitionMapping = sb.toString().trim();
    sLogger.info(partitionMapping);

    sLogger.info("PageRankSchimmy: iteration " + j + ": Phase1");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);
    sLogger.info(" - nodeCnt: " + n);
    sLogger.info(" - useCombiner: " + useCombiner);
    sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
    sLogger.info(" - numPartitions: " + numPartitions);
    sLogger.info(" - useRange: " + useRange);
    sLogger.info("computed number of partitions: " + numPartitions);

    int numMapTasks = numPartitions;
    int numReduceTasks = numPartitions;

    conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", partitionMapping);

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(FloatWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        conf.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        conf.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        conf.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    // Remove output left over from a previous run.
    fs.delete(new Path(out), true);
    fs.delete(new Path(outm), true);

    JobClient.runJob(conf);

    // Each file under the mass path starts with one float; fold them all together.
    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        try {
            mass = sumLogProbs(mass, fin.readFloat());
        } finally {
            fin.close();
        }
    }

    return mass;
}
From source file:edu.umd.cloud9.webgraph.data.IndexableAnchorTextForwardIndex.java
License:Apache License
public IndexableAnchorText getDocument(int docno) { int idx = Arrays.binarySearch(docnos, docno); if (idx < 0) idx = -idx - 2;// www.ja v a2 s .co m DecimalFormat df = new DecimalFormat("00000"); String file = collectionPath + "/part-" + df.format(filenos[idx]); try { SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(file), conf); IntWritable key = new IntWritable(); ArrayListWritable<AnchorText> value = new ArrayListWritable<AnchorText>(); reader.seek(offsets[idx]); while (reader.next(key)) { if (key.get() == docno) break; } reader.getCurrentValue(value); reader.close(); indexableAnchorText.createHTML(value); return indexableAnchorText; } catch (IOException e) { e.printStackTrace(); } return null; }
From source file:edu.umd.cloud9.webgraph.data.IndexableAnchorTextForwardIndex.java
License:Apache License
public int getLastDocno() { if (mLastDocno != -1) return mLastDocno; // find the last entry, and then see all the way to the end of the // collection int idx = docnos.length - 1; String file = collectionPath + "/part-" + df.format(filenos[idx]); try {//from ww w . ja va 2s.c o m SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(file), conf); IntWritable key = new IntWritable(); reader.seek(offsets[idx]); while (reader.next(key)) ; mLastDocno = key.get(); } catch (IOException e) { e.printStackTrace(); } return mLastDocno; }
From source file:edu.umd.shrawanraina.ExtractTopPersonalizedPageRankNodes.java
License:Apache License
@SuppressWarnings("deprecation") private void extractTop(String inputPath, String outputPath, String sources, int n) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException, InstantiationException, IllegalAccessException { // TODO Auto-generated method stub /*//w w w . j a va 2s. c om Configuration conf = getConf(); conf.setStrings("sources", sources); conf.setInt(LIMIT, n); Job job = Job.getInstance(conf); job.setJobName(ExtractTopPersonalizedPageRankNodes.class.getName() + ":" + inputPath); job.setJarByClass(ExtractTopPersonalizedPageRankNodes.class); job.setNumReduceTasks(1); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath + "-" + "Top")); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(FloatWritable.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(FloatWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. 
FileSystem.get(conf).delete(new Path(outputPath + "-" + "Top"), true); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); */ Configuration conf = new Configuration(); Path in = new Path(inputPath); FileSystem fs = FileSystem.get(conf); fs.delete(new Path(in + "/_SUCCESS"), true); List<TopScoredObjects<Integer>> queueList = new ArrayList<TopScoredObjects<Integer>>(); List<String> sourceList = Arrays.asList(sources.split(",")); for (int i = 0; i < sourceList.size(); i++) queueList.add(i, new TopScoredObjects<Integer>(n)); //System.out.println("Source : <<<<<<<"+sourceList.size()); FileStatus[] fss = fs.listStatus(new Path(in + "/")); for (FileStatus status : fss) { Path path = status.getPath(); //System.out.println("Path: <<<<<<< "+ path); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); IntWritable key = new IntWritable(); PageRankNodeUpd value = new PageRankNodeUpd(); while (reader.next(key, value)) { for (int i = 0; i < sourceList.size(); i++) { queueList.get(i).add(key.get(), value.getPageRankList().get(i)); //System.out.println(key.get() + " | " + value.getPageRankList().get(i)); } } reader.close(); } //System.out.println("List : <<<<<<<"+queueList.size()); for (int i = 0; i < sourceList.size(); i++) { TopScoredObjects<Integer> queue = queueList.get(i); System.out.println("Source : <<<<<<<" + sourceList.get(i)); for (PairOfObjectFloat<Integer> pair : queue.extractAll()) { int nodeid = ((Integer) pair.getLeftElement()); float pagerank = (float) Math.exp(pair.getRightElement()); System.out.println(String.format("%.5f %d", pagerank, nodeid)); } } }
From source file:edu.umn.cs.spatialHadoop.core.RectangleNN.java
License:Open Source License
public static <S1 extends Shape, S2 extends Shape> int SpatialJoin_planeSweepFilterOnly(final List<S1> R, final List<S2> S, final ResultCollector2<S1, S2> output, Reporter reporter) throws IOException { LOG.debug("Start spatial join plan sweep algorithm !!!"); final RectangleID[] Rmbrs = new RectangleID[R.size()]; for (int i = 0; i < R.size(); i++) { Rmbrs[i] = new RectangleID(i, R.get(i).getMBR()); }//from w w w . jav a 2 s. c o m final RectangleID[] Smbrs = new RectangleID[S.size()]; for (int i = 0; i < S.size(); i++) { Smbrs[i] = new RectangleID(i, S.get(i).getMBR()); } final IntWritable count = new IntWritable(); int filterCount = SpatialJoin_rectangles(Rmbrs, Smbrs, new OutputCollector<RectangleID, RectangleID>() { @Override public void collect(RectangleID r1, RectangleID r2) throws IOException { //if (R.get(r1.id).isIntersected(S.get(r2.id))) { if (output != null) output.collect(R.get(r1.id), S.get(r2.id)); count.set(count.get() + 1); //} } }, reporter); LOG.debug("Filtered result size " + filterCount + ", refined result size " + count.get()); return count.get(); }