Example usage for org.apache.hadoop.io IntWritable IntWritable

Introduction

This page collects example usages of the no-argument constructor of org.apache.hadoop.io.IntWritable.

Prototype

public IntWritable()

Source Link

Usage

From source file:edu.indiana.d2i.htrc.util.Utilities.java

License:Apache License

/**
 * Converts a line-oriented text file into a SequenceFile of (Text, IntWritable) pairs.
 * Each input line becomes a key; its value is the zero-based index of that line.
 *
 * @param input  path of the text file to read, one entry per line
 * @param output path of the SequenceFile to create
 * @throws IOException if the input cannot be read or the output cannot be written
 */
public static void Dictionary2SeqFile(String input, String output) throws IOException {
    BufferedReader reader = new BufferedReader(new FileReader(input));
    try {
        Configuration conf = new Configuration();
        SequenceFile.Writer writer = new SequenceFile.Writer(FileSystem.get(conf), conf, new Path(output),
                Text.class, IntWritable.class);
        try {
            // Key/value objects are reused across records; append() serializes them immediately.
            Text key = new Text();
            IntWritable value = new IntWritable();
            int count = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                key.set(line);
                value.set(count++);
                writer.append(key, value);
            }
        } finally {
            // Close the writer even when reading or appending fails.
            writer.close();
        }
    } finally {
        reader.close();
    }
}

From source file:edu.umd.cloud9.collection.clue.ClueWarcForwardIndex.java

License:Apache License

/**
 * Fetches the record stored under the given docno.
 *
 * The lookup binary-searches {@code docnos} for the closest indexed entry at or before
 * {@code docno}, seeks the corresponding part file to the recorded offset, and scans
 * forward until a key equal to {@code docno} is read.
 *
 * @param docno document number to fetch
 * @return the matching record, or {@code null} if {@code docno} is outside
 *         [getFirstDocno(), getLastDocno()], is not found by the scan, or an I/O error occurs
 */
@Override
public ClueWarcRecord getDocument(int docno) {
    long start = System.currentTimeMillis();

    // Trap invalid docnos.
    if (docno < getFirstDocno() || docno > getLastDocno()) {
        return null;
    }

    int idx = Arrays.binarySearch(docnos, docno);

    if (idx < 0) {
        // No exact hit: binarySearch returned -(insertionPoint) - 1, so this selects
        // the indexed entry immediately before the insertion point.
        idx = -idx - 2;
    }

    DecimalFormat df = new DecimalFormat("00000");
    String file = collectionPath + "/part-" + df.format(fileno[idx]);

    LOG.info("fetching docno " + docno + ": seeking to " + offsets[idx] + " at " + file);

    try {
        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(file)));
        try {
            IntWritable key = new IntWritable();
            ClueWarcRecord value = new ClueWarcRecord();

            reader.seek(offsets[idx]);

            boolean found = false;
            while (reader.next(key)) {
                if (key.get() == docno) {
                    found = true;
                    break;
                }
            }

            if (!found) {
                // Reached the end of the file without a match; there is no current
                // value that belongs to the requested docno.
                LOG.info(" docno " + docno + " not found in " + file);
                return null;
            }

            reader.getCurrentValue(value);

            long duration = System.currentTimeMillis() - start;

            LOG.info(" docno " + docno + " fetched in " + duration + "ms");
            return value;
        } finally {
            // Release the reader on every path, including exceptions.
            reader.close();
        }
    } catch (IOException e) {
        LOG.error("error fetching docno " + docno + " from " + file, e);
    }

    return null;
}

From source file:edu.umd.cloud9.collection.clue.ClueWarcForwardIndex.java

License:Apache License

/**
 * Returns the last docno in the collection, computing and caching it on first use.
 *
 * The value is found by seeking to the last indexed offset and reading keys until the
 * reader reports no more records; the last key read is cached in {@code lastDocno}.
 *
 * @return the cached or newly computed last docno; if an I/O error occurs the current
 *         value of {@code lastDocno} is returned unchanged
 */
@Override
public int getLastDocno() {
    if (lastDocno != -1) {
        return lastDocno;
    }

    // Find the last entry, and then seek all the way to the end of the collection.
    int idx = docnos.length - 1;

    String file = collectionPath + "/part-" + FORMAT5.format(fileno[idx]);

    try {
        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(file)));
        try {
            IntWritable key = new IntWritable();

            reader.seek(offsets[idx]);

            // Drain the remaining keys; the final one read is the last docno.
            while (reader.next(key)) {
                // intentionally empty
            }
            lastDocno = key.get();
        } finally {
            // Close the reader even if seeking or reading fails.
            reader.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    return lastDocno;
}

From source file:edu.umd.cloud9.collection.clue.ScanBlockCompressedSequenceFile.java

License:Apache License

/**
 * Scans a SequenceFile of (IntWritable, ClueWarcRecord) pairs, printing the reader position
 * for every record and noting each position at which the reported position changes.
 * It then reopens the file and seeks to each noted position to print the record found there.
 *
 * @param args a single argument: the path of the SequenceFile to scan
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        System.out.println("usage: [SequenceFile]");
        System.exit(-1);
    }

    List<Long> seekPoints = Lists.newArrayList();
    long prevPos = -1;
    int prevDocno = 0;

    Path path = new Path(args[0]);
    Configuration config = new Configuration();

    IntWritable key = new IntWritable();
    ClueWarcRecord value = new ClueWarcRecord();

    // First pass: record every position at which the reader's position changed.
    SequenceFile.Reader reader = new SequenceFile.Reader(config, SequenceFile.Reader.file(path));
    try {
        long pos = reader.getPosition();
        while (reader.next(key, value)) {
            if (prevPos != -1 && prevPos != pos) {
                System.out.println("## beginning of block at " + prevPos + ", docno:" + prevDocno);
                seekPoints.add(prevPos);
            }

            System.out.println("offset:" + pos + "\tdocno:" + key + "\tdocid:" + value.getDocid());

            prevPos = pos;
            pos = reader.getPosition();
            prevDocno = key.get();
        }
    } finally {
        reader.close();
    }

    // Second pass: verify that each recorded position can be used as a seek target.
    reader = new SequenceFile.Reader(config, SequenceFile.Reader.file(path));
    try {
        for (long p : seekPoints) {
            reader.seek(p);
            reader.next(key, value);
            System.out.println("seeking to pos " + p + "\tdocno:" + key + "\tdocid:" + value.getDocid());
        }
    } finally {
        reader.close();
    }
}

From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java

License:Apache License

/**
 * Rewrites the MapFile under {@code inputPath} into {@code outputPath}, consulting the
 * redirect MapFile for every key.
 *
 * For each (key, map) entry read from the input: the key is looked up in the redirect file.
 * If that yields a positive target id, the target's map is fetched from the input, the
 * current entry's map is merged into it, and the result is written under the target id
 * (only when the fetched map is non-empty). Otherwise the entry is written unchanged.
 * The input directory is deleted recursively afterwards, whether or not the rewrite succeeded.
 *
 * @param inputPath    directory containing the input MapFile at {@code part-r-00000}
 * @param redirectPath path of the redirect MapFile (IntWritable key to IntWritable target)
 * @param outputPath   path of the MapFile to create
 * @throws IOException if any MapFile cannot be opened, read, or written
 */
private void task3(String inputPath, String redirectPath, String outputPath) throws IOException {

    // Reusable key/value holders.
    IntWritable mapKey = new IntWritable();
    HMapSIW mapVal = new HMapSIW();
    HMapSIW tmpMap = new HMapSIW();
    IntWritable target = new IntWritable(0);

    MapFile.Reader redirectReader = null;
    MapFile.Writer mapWriter = null;
    MapFile.Reader mapReader = null;

    try {
        mapReader = new MapFile.Reader(new Path(inputPath + "/part-r-00000"), getConf());

        redirectReader = new MapFile.Reader(new Path(redirectPath), getConf());

        // TODO: Change code here
        mapWriter = new MapFile.Writer(getConf(), new Path(outputPath),
                MapFile.Writer.keyClass(IntWritable.class), MapFile.Writer.valueClass(HMapSIW.class));

        // NOTE(review): mapReader.get() below is a keyed lookup on the same reader that drives
        // this loop; it presumably repositions the reader, so the following next() may not
        // continue from where the iteration left off. A dedicated lookup reader may be needed.
        // NOTE(review): appending under `target` may violate the writer's key ordering - confirm.
        while (mapReader.next(mapKey, mapVal)) {
            // Reset so a failed lookup cannot reuse the previous iteration's target.
            target.set(0);
            redirectReader.get(mapKey, target);
            if (target.get() > 0) {
                // Only trust tmpMap when the lookup actually found the target key.
                if (mapReader.get(target, tmpMap) != null && !tmpMap.isEmpty()) {
                    tmpMap.putAll(mapVal);
                    mapWriter.append(target, tmpMap);
                }
            } else {
                mapWriter.append(mapKey, mapVal);
            }
        }
    } finally {
        if (mapWriter != null)
            mapWriter.close();
        if (mapReader != null)
            mapReader.close();
        if (redirectReader != null)
            redirectReader.close();

        // Clean up intermediate data.
        FileSystem.get(getConf()).delete(new Path(inputPath), true);
    }
}

From source file:edu.umd.cloud9.pagerank.RunPageRankSchimmy.java

License:Apache License

/**
 * Configures and runs phase 1 of one PageRank (Schimmy) iteration, then returns the mass
 * accumulated from the files the job wrote under the "-mass" output path.
 *
 * Before launching the job, every "part-" file in the input directory is opened and its first
 * record is passed through the partitioner to learn which partition number that file holds.
 * The resulting "partition=path" mapping is handed to the job via the "PartitionMapping"
 * configuration key.
 *
 * @param path             base directory containing the per-iteration directories
 * @param i                iteration number of the input directory
 * @param j                iteration number of the output directory
 * @param n                node count, stored under the "NodeCount" configuration key
 * @param useCombiner      whether to set a combiner class on the job
 * @param useInmapCombiner whether to use the in-mapper-combining mapper class
 * @param useRange         whether to use RangePartitioner instead of HashPartitioner
 * @return the values read from the mass files, folded together with {@code sumLogProbs}
 *         starting from negative infinity
 * @throws IOException if the file system or the job fails
 */
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);

    String in = path + "/iter" + sFormat.format(i);
    String out = path + "/iter" + sFormat.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // List the input directory once; both the partition count and the mapping use it.
    FileStatus[] status = fs.listStatus(new Path(in));

    // we need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log)
    int numPartitions = 0;
    for (FileStatus s : status) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    conf.setInt("NodeCount", n);

    Partitioner p = null;

    if (useRange) {
        p = new RangePartitioner<IntWritable, Writable>();
        p.configure(conf);
    } else {
        p = new HashPartitioner<WritableComparable, Writable>();
    }

    // this is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();

    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        // Use the same "part-" filter as the count above so that only partition files are
        // opened as SequenceFiles.
        if (!f.getPath().getName().contains("part-"))
            continue;

        int np;
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);
        try {
            reader.next(key, value);
            np = p.getPartition(key, value, numPartitions);
        } finally {
            reader.close();
        }

        sLogger.info(f.getPath() + "\t" + np);
        sb.append(np).append("=").append(f.getPath()).append("\t");
    }

    String partitionMapping = sb.toString().trim();
    sLogger.info(partitionMapping);

    sLogger.info("PageRankSchimmy: iteration " + j + ": Phase1");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);
    sLogger.info(" - nodeCnt: " + n);
    sLogger.info(" - useCombiner: " + useCombiner);
    sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
    sLogger.info(" - numPartitions: " + numPartitions);
    sLogger.info(" - useRange: " + useRange);
    sLogger.info("computed number of partitions: " + numPartitions);

    int numMapTasks = numPartitions;
    int numReduceTasks = numPartitions;

    conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", partitionMapping);

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(FloatWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        conf.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        conf.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        conf.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    // Remove output left over from a previous run.
    fs.delete(new Path(out), true);
    fs.delete(new Path(outm), true);

    JobClient.runJob(conf);

    // Fold together the single float stored at the start of each mass file.
    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        try {
            mass = sumLogProbs(mass, fin.readFloat());
        } finally {
            fin.close();
        }
    }

    return mass;
}

From source file:edu.umd.cloud9.webgraph.data.IndexableAnchorTextForwardIndex.java

License:Apache License

/**
 * Fetches the anchor text stored under the given docno and loads it into the shared
 * {@code indexableAnchorText} instance.
 *
 * The lookup binary-searches {@code docnos} for the closest indexed entry at or before
 * {@code docno}, seeks the corresponding part file to the recorded offset, and scans
 * forward until a key equal to {@code docno} is read.
 *
 * @param docno document number to fetch
 * @return the shared {@code indexableAnchorText} populated from the matching record, or
 *         {@code null} if {@code docno} precedes the first indexed entry, is not found by
 *         the scan, or an I/O error occurs
 */
public IndexableAnchorText getDocument(int docno) {
    int idx = Arrays.binarySearch(docnos, docno);

    if (idx < 0) {
        // No exact hit: binarySearch returned -(insertionPoint) - 1, so this selects
        // the indexed entry immediately before the insertion point.
        idx = -idx - 2;
    }

    if (idx < 0) {
        // docno is smaller than every indexed docno (or the index is empty).
        return null;
    }

    DecimalFormat df = new DecimalFormat("00000");
    String file = collectionPath + "/part-" + df.format(filenos[idx]);

    try {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(file), conf);
        try {
            IntWritable key = new IntWritable();
            ArrayListWritable<AnchorText> value = new ArrayListWritable<AnchorText>();

            reader.seek(offsets[idx]);

            boolean found = false;
            while (reader.next(key)) {
                if (key.get() == docno) {
                    found = true;
                    break;
                }
            }

            if (!found) {
                // Reached the end of the file without a match; there is no current
                // value that belongs to the requested docno.
                return null;
            }

            reader.getCurrentValue(value);

            indexableAnchorText.createHTML(value);
            return indexableAnchorText;
        } finally {
            // Release the reader on every path, including exceptions.
            reader.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    return null;
}

From source file:edu.umd.cloud9.webgraph.data.IndexableAnchorTextForwardIndex.java

License:Apache License

/**
 * Returns the last docno in the collection, computing and caching it on first use.
 *
 * The value is found by seeking to the last indexed offset and reading keys until the
 * reader reports no more records; the last key read is cached in {@code mLastDocno}.
 *
 * @return the cached or newly computed last docno; if an I/O error occurs the current
 *         value of {@code mLastDocno} is returned unchanged
 */
public int getLastDocno() {
    if (mLastDocno != -1)
        return mLastDocno;

    // find the last entry, and then seek all the way to the end of the
    // collection
    int idx = docnos.length - 1;

    String file = collectionPath + "/part-" + df.format(filenos[idx]);

    try {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(file), conf);
        try {
            IntWritable key = new IntWritable();

            reader.seek(offsets[idx]);

            // Drain the remaining keys; the final one read is the last docno.
            while (reader.next(key)) {
                // intentionally empty
            }
            mLastDocno = key.get();
        } finally {
            // The reader was previously never closed; release it on every path.
            reader.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    return mLastDocno;
}

From source file:edu.umd.shrawanraina.ExtractTopPersonalizedPageRankNodes.java

License:Apache License

/**
 * Prints, for each source listed in {@code sources}, the top {@code n} nodes by score.
 *
 * Every file in {@code inputPath} (after removing its {@code _SUCCESS} marker) is read as a
 * SequenceFile of (IntWritable, PageRankNodeUpd). For the i-th source, the i-th element of
 * each node's page-rank list is offered to a bounded top-{@code n} queue. The queues are then
 * printed to standard output, one "score nodeid" line per entry, with each stored score
 * passed through {@code Math.exp} first.
 *
 * @param inputPath  directory of SequenceFiles to scan
 * @param outputPath not used by this implementation
 * @param sources    comma-separated list of sources; its length determines the number of queues
 * @param n          maximum number of entries kept per source
 * @throws IOException if the input cannot be listed or read
 */
@SuppressWarnings("deprecation")
private void extractTop(String inputPath, String outputPath, String sources, int n)
        throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException,
        InstantiationException, IllegalAccessException {
    Configuration conf = new Configuration();
    Path in = new Path(inputPath);
    FileSystem fs = FileSystem.get(conf);

    // Remove the marker file so that every remaining entry is read as a SequenceFile.
    fs.delete(new Path(in + "/_SUCCESS"), true);

    List<String> sourceList = Arrays.asList(sources.split(","));
    int sourceCount = sourceList.size();

    // One bounded top-n queue per source, in the same order as sourceList.
    List<TopScoredObjects<Integer>> queueList = new ArrayList<TopScoredObjects<Integer>>();
    for (int i = 0; i < sourceCount; i++) {
        queueList.add(new TopScoredObjects<Integer>(n));
    }

    FileStatus[] fss = fs.listStatus(new Path(in + "/"));
    for (FileStatus status : fss) {
        Path path = status.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            IntWritable key = new IntWritable();
            PageRankNodeUpd value = new PageRankNodeUpd();
            while (reader.next(key, value)) {
                for (int i = 0; i < sourceCount; i++) {
                    queueList.get(i).add(key.get(), value.getPageRankList().get(i));
                }
            }
        } finally {
            // Close the reader even if a record fails to deserialize.
            reader.close();
        }
    }

    for (int i = 0; i < sourceCount; i++) {
        TopScoredObjects<Integer> queue = queueList.get(i);
        System.out.println("Source : <<<<<<<" + sourceList.get(i));
        for (PairOfObjectFloat<Integer> pair : queue.extractAll()) {
            int nodeid = pair.getLeftElement();
            float pagerank = (float) Math.exp(pair.getRightElement());
            System.out.println(String.format("%.5f %d", pagerank, nodeid));
        }
    }
}

From source file:edu.umn.cs.spatialHadoop.core.RectangleNN.java

License:Open Source License

/**
 * Joins two shape lists using only their minimum bounding rectangles.
 *
 * Each shape is wrapped in a {@code RectangleID} that pairs its list index with its MBR, and
 * the two rectangle arrays are handed to {@code SpatialJoin_rectangles}. Every rectangle pair
 * that routine reports is mapped back to the original shapes and forwarded to {@code output}
 * (when non-null) without any further intersection test on the shapes themselves.
 *
 * @param R        first list of shapes
 * @param S        second list of shapes
 * @param output   receives each reported pair of shapes; may be {@code null} to only count
 * @param reporter passed through to {@code SpatialJoin_rectangles}
 * @return the number of pairs reported
 * @throws IOException if the underlying join or the output collector fails
 */
public static <S1 extends Shape, S2 extends Shape> int SpatialJoin_planeSweepFilterOnly(final List<S1> R,
        final List<S2> S, final ResultCollector2<S1, S2> output, Reporter reporter) throws IOException {

    LOG.debug("Start spatial join plan sweep algorithm !!!");

    // Wrap every shape's MBR together with its position in the source list.
    final RectangleID[] leftMbrs = new RectangleID[R.size()];
    for (int r = 0; r < leftMbrs.length; r++) {
        leftMbrs[r] = new RectangleID(r, R.get(r).getMBR());
    }

    final RectangleID[] rightMbrs = new RectangleID[S.size()];
    for (int s = 0; s < rightMbrs.length; s++) {
        rightMbrs[s] = new RectangleID(s, S.get(s).getMBR());
    }

    // Mutable holder so the anonymous collector below can update the tally.
    final IntWritable emitted = new IntWritable();

    OutputCollector<RectangleID, RectangleID> pairSink = new OutputCollector<RectangleID, RectangleID>() {
        @Override
        public void collect(RectangleID left, RectangleID right) throws IOException {
            // The exact shape-intersection refinement is disabled; every MBR pair is accepted.
            if (output != null) {
                output.collect(R.get(left.id), S.get(right.id));
            }
            emitted.set(emitted.get() + 1);
        }
    };

    int filterCount = SpatialJoin_rectangles(leftMbrs, rightMbrs, pairSink, reporter);

    LOG.debug("Filtered result size " + filterCount + ", refined result size " + emitted.get());

    return emitted.get();
}