Example usage for the org.apache.hadoop.io.IntWritable constructor IntWritable()

List of usage examples for the org.apache.hadoop.io.IntWritable constructor IntWritable()

Introduction

On this page you can find example usages of the no-argument constructor IntWritable() of org.apache.hadoop.io.IntWritable.

Prototype

public IntWritable() 

Source Link

Usage

From source file:edu.indiana.d2i.htrc.util.Utilities.java

License:Apache License

/**
 * Converts a line-oriented text file into a SequenceFile of (Text, IntWritable) records.
 *
 * <p>Each line of {@code input} becomes one record whose key is the line's text and whose
 * value is the zero-based index of that line in the input file.
 *
 * @param input  path of the text file to read, one entry per line
 * @param output path of the SequenceFile to create
 * @throws IOException if reading the input or writing the output fails
 */
public static void Dictionary2SeqFile(String input, String output) throws IOException {
    BufferedReader reader = new BufferedReader(new FileReader(input));
    try {
        Configuration conf = new Configuration();
        SequenceFile.Writer writer = new SequenceFile.Writer(FileSystem.get(conf), conf, new Path(output),
                Text.class, IntWritable.class);
        try {
            // Key and value objects are reused across records; append() serializes them immediately.
            Text key = new Text();
            IntWritable value = new IntWritable();
            int count = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                key.set(line);
                value.set(count++);
                writer.append(key, value);
            }
        } finally {
            // Close the writer even when reading or appending fails, so the output handle is not leaked.
            writer.close();
        }
    } finally {
        reader.close();
    }
}

From source file:edu.umd.cloud9.collection.clue.ClueWarcForwardIndex.java

License:Apache License

/**
 * Fetches the record with the given docno from the indexed collection.
 *
 * <p>The index arrays ({@code docnos}, {@code fileno}, {@code offsets}) are searched for the
 * last entry whose docno is not greater than the requested one; the corresponding part file is
 * then scanned forward from that entry's offset until the docno is found.
 *
 * @param docno document number to fetch
 * @return the matching record, or {@code null} if the docno is out of range, is not present
 *         in the scanned file, or an I/O error occurs
 */
@Override
public ClueWarcRecord getDocument(int docno) {
    long start = System.currentTimeMillis();

    // Trap invalid docnos.
    if (docno < getFirstDocno() || docno > getLastDocno()) {
        return null;
    }

    int idx = Arrays.binarySearch(docnos, docno);

    if (idx < 0) {
        // binarySearch returns -(insertionPoint) - 1 on a miss; step back to the
        // index entry immediately preceding the insertion point.
        idx = -idx - 2;
    }

    if (idx < 0) {
        // docno sorts before the first index entry, so there is nothing to seek to.
        return null;
    }

    DecimalFormat df = new DecimalFormat("00000");
    String file = collectionPath + "/part-" + df.format(fileno[idx]);

    LOG.info("fetching docno " + docno + ": seeking to " + offsets[idx] + " at " + file);

    SequenceFile.Reader reader = null;
    try {
        reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(file)));

        IntWritable key = new IntWritable();
        ClueWarcRecord value = new ClueWarcRecord();

        reader.seek(offsets[idx]);

        boolean found = false;
        while (reader.next(key)) {
            if (key.get() == docno) {
                found = true;
                break;
            }
        }

        if (!found) {
            // Reached end of file without seeing the docno: there is no current value to read.
            return null;
        }

        reader.getCurrentValue(value);

        long duration = System.currentTimeMillis() - start;

        LOG.info(" docno " + docno + " fetched in " + duration + "ms");
        return value;
    } catch (IOException e) {
        LOG.error("error fetching docno " + docno + " from " + file, e);
    } finally {
        // Always release the file handle, including on the early-return and error paths.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.error("error closing " + file, e);
            }
        }
    }

    return null;
}

From source file:edu.umd.cloud9.collection.clue.ClueWarcForwardIndex.java

License:Apache License

/**
 * Returns the last docno in the collection, computing and caching it on first use.
 *
 * <p>A cached value of {@code -1} in {@code lastDocno} means "not yet computed". The value is
 * found by seeking to the final index entry and reading keys until the end of that part file.
 *
 * @return the last docno; the cached value (possibly still {@code -1}) if an I/O error
 *         prevents reading the file
 */
@Override
public int getLastDocno() {
    if (lastDocno != -1) {
        return lastDocno;
    }

    // Find the last entry, and then seek all the way to the end of the collection.
    int idx = docnos.length - 1;

    String file = collectionPath + "/part-" + FORMAT5.format(fileno[idx]);

    SequenceFile.Reader reader = null;
    try {
        reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(file)));
        IntWritable key = new IntWritable();

        reader.seek(offsets[idx]);

        // Drain the remaining keys; after the loop, key holds the last one read.
        while (reader.next(key)) {
            // intentionally empty
        }
        lastDocno = key.get();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the file handle even when seeking or reading fails.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    return lastDocno;
}

From source file:edu.umd.cloud9.collection.clue.ScanBlockCompressedSequenceFile.java

License:Apache License

/**
 * Scans a SequenceFile of (IntWritable, ClueWarcRecord) records, printing each record's
 * reader position, docno and docid, and reporting every position at which the reader position
 * changes between records. It then reopens the file and seeks to each reported position to
 * print the record found there.
 *
 * @param args a single argument: the path of the SequenceFile to scan
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        System.out.println("usage: [SequenceFile]");
        System.exit(-1);
    }

    List<Long> seekPoints = Lists.newArrayList();

    Path path = new Path(args[0]);
    Configuration config = new Configuration();

    IntWritable key = new IntWritable();
    ClueWarcRecord value = new ClueWarcRecord();

    // First pass: sequential scan, collecting the positions where the reader position changes.
    SequenceFile.Reader reader = new SequenceFile.Reader(config, SequenceFile.Reader.file(path));
    try {
        long prevPos = -1;
        int prevDocno = 0;
        long pos = reader.getPosition();

        while (reader.next(key, value)) {
            if (prevPos != -1 && prevPos != pos) {
                System.out.println("## beginning of block at " + prevPos + ", docno:" + prevDocno);
                seekPoints.add(prevPos);
            }

            System.out.println("offset:" + pos + "\tdocno:" + key + "\tdocid:" + value.getDocid());

            prevPos = pos;
            pos = reader.getPosition();
            prevDocno = key.get();
        }
    } finally {
        reader.close();
    }

    // Second pass: verify each collected position by seeking to it and reading one record.
    reader = new SequenceFile.Reader(config, SequenceFile.Reader.file(path));
    try {
        for (long p : seekPoints) {
            reader.seek(p);
            reader.next(key, value);
            System.out.println("seeking to pos " + p + "\tdocno:" + key + "\tdocid:" + value.getDocid());
        }
    } finally {
        reader.close();
    }
}

From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java

License:Apache License

/**
 * Rewrites the MapFile under {@code inputPath} into {@code outputPath}, consulting the MapFile
 * at {@code redirectPath} for each key.
 *
 * <p>For every (key, map) record read from {@code inputPath/part-r-00000}:
 * <ul>
 *   <li>if the redirect file yields a target greater than zero for the key, the record stored
 *       under that target is looked up, the current map is merged into it, and the merged map
 *       is written under the target key (nothing is written if the target has no record or its
 *       map is empty);</li>
 *   <li>otherwise the record is written unchanged.</li>
 * </ul>
 * {@code inputPath} is deleted recursively once all files have been closed, including when
 * processing failed.
 *
 * @param inputPath    directory holding the intermediate MapFile {@code part-r-00000}
 * @param redirectPath path of the redirect MapFile (IntWritable key to IntWritable target)
 * @param outputPath   path of the MapFile to create
 * @throws IOException if any file cannot be read, written, closed or deleted
 */
private void task3(String inputPath, String redirectPath, String outputPath) throws IOException {

    // caches
    IntWritable mapKey = new IntWritable();
    HMapSIW mapVal = new HMapSIW();
    HMapSIW tmpMap = new HMapSIW();
    IntWritable target = new IntWritable(0);

    // read the redirect file
    MapFile.Reader redirectReader = null;
    MapFile.Writer mapWriter = null;
    MapFile.Reader mapReader = null;

    try {
        mapReader = new MapFile.Reader(new Path(inputPath + "/part-r-00000"), getConf());

        redirectReader = new MapFile.Reader(new Path(redirectPath), getConf());

        // TODO: Change code here
        mapWriter = new MapFile.Writer(getConf(), new Path(outputPath),
                MapFile.Writer.keyClass(IntWritable.class), MapFile.Writer.valueClass(HMapSIW.class));

        while (mapReader.next(mapKey, mapVal)) {
            // MapFile.Reader.get leaves the value object untouched when the key is absent,
            // so reset the reused target to avoid acting on the previous iteration's redirect.
            target.set(0);
            redirectReader.get(mapKey, target);
            if (target.get() > 0) {
                // NOTE(review): get() is called on the same reader that is being iterated with
                // next(); confirm that the lookup does not disturb the iteration position.
                // A null result means the target has no record, so tmpMap would be stale.
                if (mapReader.get(target, tmpMap) != null && !tmpMap.isEmpty()) {
                    tmpMap.putAll(mapVal);
                    mapWriter.append(target, tmpMap);
                }
            } else {
                mapWriter.append(mapKey, mapVal);
            }
        }
    } finally {
        // Close every handle even if an earlier close() throws.
        try {
            if (mapWriter != null) {
                mapWriter.close();
            }
        } finally {
            try {
                if (mapReader != null) {
                    mapReader.close();
                }
            } finally {
                if (redirectReader != null) {
                    redirectReader.close();
                }
            }
        }

        // Clean up intermediate data.
        FileSystem.get(getConf()).delete(new Path(inputPath), true);
    }
}

From source file:edu.umd.cloud9.pagerank.RunPageRankSchimmy.java

License:Apache License

/**
 * Configures and runs phase 1 of one Schimmy PageRank iteration, then returns the mass
 * accumulated from the files the job wrote under the "-mass" output directory.
 *
 * <p>Before launching the job, each input file is opened and its first record is passed to the
 * partitioner, so that the mapping from partition number to file path can be handed to the job
 * through the {@code PartitionMapping} configuration property.
 *
 * @param path             base directory containing the per-iteration subdirectories
 * @param i                iteration number used to build the input directory name
 * @param j                iteration number used to build the output directory name
 * @param n                node count, stored in the job configuration as {@code NodeCount}
 * @param useCombiner      whether to set {@code CombineClass} as the combiner
 * @param useInmapCombiner whether to use {@code MapWithInMapperCombiningClass} as the mapper
 * @param useRange         whether to partition with {@code RangePartitioner} instead of hashing
 * @return the values read from the mass files, combined with {@code sumLogProbs}
 * @throws IOException if the file system or the job fails
 */
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);

    String in = path + "/iter" + sFormat.format(i);
    String out = path + "/iter" + sFormat.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // we need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log)
    int numPartitions = 0;
    for (FileStatus s : fs.listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    conf.setInt("NodeCount", n);

    Partitioner p = null;

    if (useRange) {
        p = new RangePartitioner<IntWritable, Writable>();
        p.configure(conf);
    } else {
        p = new HashPartitioner<WritableComparable, Writable>();
    }

    // this is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        // NOTE(review): the count above keeps only names containing "part-", while this loop
        // skips only "_logs"; confirm the directory never holds other non-part files.
        if (f.getPath().getName().contains("_logs")) {
            continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);
        int np;
        try {
            // NOTE(review): the result of next() is ignored; for an empty file, key and value
            // would still hold the previous file's first record. Confirm files are never empty.
            reader.next(key, value);
            np = p.getPartition(key, value, numPartitions);
        } finally {
            // Do not leak the file handle if reading or partitioning throws.
            reader.close();
        }

        sLogger.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + "\t");
    }

    sLogger.info(sb.toString().trim());

    sLogger.info("PageRankSchimmy: iteration " + j + ": Phase1");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);
    sLogger.info(" - nodeCnt: " + n);
    sLogger.info(" - useCombiner: " + useCombiner);
    sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
    sLogger.info(" - numPartitions: " + numPartitions);
    sLogger.info(" - useRange: " + useRange);
    sLogger.info("computed number of partitions: " + numPartitions);

    int numMapTasks = numPartitions;
    int numReduceTasks = numPartitions;

    conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(FloatWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        conf.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        conf.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        conf.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    // Remove output left over from a previous run before launching the job.
    fs.delete(new Path(out), true);
    fs.delete(new Path(outm), true);

    JobClient.runJob(conf);

    // Fold the single float stored in each mass file into the running total.
    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        try {
            mass = sumLogProbs(mass, fin.readFloat());
        } finally {
            fin.close();
        }
    }

    return mass;
}

From source file:edu.umd.cloud9.webgraph.data.IndexableAnchorTextForwardIndex.java

License:Apache License

/**
 * Fetches the anchor text stored for the given docno and renders it into the shared
 * {@code indexableAnchorText} instance.
 *
 * <p>The index arrays ({@code docnos}, {@code filenos}, {@code offsets}) are searched for the
 * last entry whose docno is not greater than the requested one; the corresponding part file is
 * then scanned forward from that entry's offset until the docno is found.
 *
 * @param docno document number to fetch
 * @return the shared {@code indexableAnchorText} populated for this docno, or {@code null} if
 *         the docno precedes the first index entry, is not present in the scanned file, or an
 *         I/O error occurs
 */
public IndexableAnchorText getDocument(int docno) {
    int idx = Arrays.binarySearch(docnos, docno);

    if (idx < 0) {
        // binarySearch returns -(insertionPoint) - 1 on a miss; step back to the
        // index entry immediately preceding the insertion point.
        idx = -idx - 2;
    }

    if (idx < 0) {
        // docno sorts before the first index entry, so there is nothing to seek to.
        return null;
    }

    DecimalFormat df = new DecimalFormat("00000");
    String file = collectionPath + "/part-" + df.format(filenos[idx]);

    SequenceFile.Reader reader = null;
    try {
        reader = new SequenceFile.Reader(fs, new Path(file), conf);

        IntWritable key = new IntWritable();
        ArrayListWritable<AnchorText> value = new ArrayListWritable<AnchorText>();

        reader.seek(offsets[idx]);

        boolean found = false;
        while (reader.next(key)) {
            if (key.get() == docno) {
                found = true;
                break;
            }
        }

        if (!found) {
            // Reached end of file without seeing the docno: there is no current value to read.
            return null;
        }

        reader.getCurrentValue(value);

        indexableAnchorText.createHTML(value);
        return indexableAnchorText;
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Always release the file handle, including on the early-return and error paths.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    return null;
}

From source file:edu.umd.cloud9.webgraph.data.IndexableAnchorTextForwardIndex.java

License:Apache License

/**
 * Returns the last docno in the collection, computing and caching it on first use.
 *
 * <p>A cached value of {@code -1} in {@code mLastDocno} means "not yet computed". The value is
 * found by seeking to the final index entry and reading keys until the end of that part file.
 *
 * @return the last docno; the cached value (possibly still {@code -1}) if an I/O error
 *         prevents reading the file
 */
public int getLastDocno() {
    if (mLastDocno != -1) {
        return mLastDocno;
    }

    // find the last entry, and then seek all the way to the end of the
    // collection
    int idx = docnos.length - 1;

    String file = collectionPath + "/part-" + df.format(filenos[idx]);

    SequenceFile.Reader reader = null;
    try {
        reader = new SequenceFile.Reader(fs, new Path(file), conf);
        IntWritable key = new IntWritable();

        reader.seek(offsets[idx]);

        // Drain the remaining keys; after the loop, key holds the last one read.
        while (reader.next(key)) {
            // intentionally empty
        }
        mLastDocno = key.get();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // The reader was previously never closed; release the file handle on every path.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    return mLastDocno;
}

From source file:edu.umd.shrawanraina.ExtractTopPersonalizedPageRankNodes.java

License:Apache License

/**
 * Prints, for each source node, the top {@code n} nodes by personalized PageRank.
 *
 * <p>Every file under {@code inputPath} is read as a SequenceFile of
 * (IntWritable, PageRankNodeUpd) records. For the i-th source, the i-th entry of each node's
 * PageRank list is offered to a bounded top-{@code n} queue. The queues are then printed to
 * standard output, one section per source, with each score passed through {@code Math.exp}.
 *
 * <p>The {@code _SUCCESS} marker under {@code inputPath} is deleted first so that it is not
 * opened as a SequenceFile.
 *
 * @param inputPath  directory containing the SequenceFiles to scan
 * @param outputPath unused by the current implementation
 * @param sources    comma-separated list of source node ids
 * @param n          number of top-scoring nodes to keep per source
 * @throws IOException if the input cannot be listed or read
 */
@SuppressWarnings("deprecation")
private void extractTop(String inputPath, String outputPath, String sources, int n)
        throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException,
        InstantiationException, IllegalAccessException {
    Configuration conf = new Configuration();
    Path in = new Path(inputPath);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(new Path(in + "/_SUCCESS"), true);

    List<String> sourceList = Arrays.asList(sources.split(","));
    int numSources = sourceList.size();

    // One bounded top-n queue per source, in the same order as sourceList.
    List<TopScoredObjects<Integer>> queueList = new ArrayList<TopScoredObjects<Integer>>();
    for (int i = 0; i < numSources; i++) {
        queueList.add(new TopScoredObjects<Integer>(n));
    }

    FileStatus[] fss = fs.listStatus(new Path(in + "/"));
    for (FileStatus status : fss) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, status.getPath(), conf);
        try {
            IntWritable key = new IntWritable();
            PageRankNodeUpd value = new PageRankNodeUpd();
            while (reader.next(key, value)) {
                for (int i = 0; i < numSources; i++) {
                    queueList.get(i).add(key.get(), value.getPageRankList().get(i));
                }
            }
        } finally {
            // Do not leak the file handle if a record fails to deserialize.
            reader.close();
        }
    }

    for (int i = 0; i < numSources; i++) {
        TopScoredObjects<Integer> queue = queueList.get(i);
        System.out.println("Source : <<<<<<<" + sourceList.get(i));
        for (PairOfObjectFloat<Integer> pair : queue.extractAll()) {
            int nodeid = (Integer) pair.getLeftElement();
            float pagerank = (float) Math.exp(pair.getRightElement());
            System.out.println(String.format("%.5f %d", pagerank, nodeid));
        }
    }
}

From source file:edu.umn.cs.spatialHadoop.core.RectangleNN.java

License:Open Source License

/**
 * Joins two shape lists using only the filter step: the MBRs of all shapes are joined with
 * {@code SpatialJoin_rectangles}, and every MBR pair it produces is reported as a result
 * without testing the shapes themselves for intersection.
 *
 * @param R        first list of shapes
 * @param S        second list of shapes
 * @param output   receives each reported pair of shapes; may be {@code null}, in which case
 *                 pairs are only counted
 * @param reporter passed through to {@code SpatialJoin_rectangles}
 * @return the number of pairs handed to the collector
 * @throws IOException if the rectangle join or the output collector fails
 */
public static <S1 extends Shape, S2 extends Shape> int SpatialJoin_planeSweepFilterOnly(final List<S1> R,
        final List<S2> S, final ResultCollector2<S1, S2> output, Reporter reporter) throws IOException {

    LOG.debug("Start spatial join plan sweep algorithm !!!");

    // Wrap each shape's MBR together with its position in the source list, so that a
    // matched rectangle can be mapped back to the shape it came from.
    final int leftSize = R.size();
    final RectangleID[] leftBoxes = new RectangleID[leftSize];
    for (int pos = 0; pos < leftSize; pos++) {
        leftBoxes[pos] = new RectangleID(pos, R.get(pos).getMBR());
    }

    final int rightSize = S.size();
    final RectangleID[] rightBoxes = new RectangleID[rightSize];
    for (int pos = 0; pos < rightSize; pos++) {
        rightBoxes[pos] = new RectangleID(pos, S.get(pos).getMBR());
    }

    // A mutable holder is needed because the anonymous collector can only capture finals.
    final IntWritable emitted = new IntWritable();

    OutputCollector<RectangleID, RectangleID> pairCollector = new OutputCollector<RectangleID, RectangleID>() {
        @Override
        public void collect(RectangleID left, RectangleID right) throws IOException {
            // Filter only: the pair is forwarded without a refinement (exact intersection) check.
            if (output != null) {
                output.collect(R.get(left.id), S.get(right.id));
            }
            emitted.set(1 + emitted.get());
        }
    };

    int filterCount = SpatialJoin_rectangles(leftBoxes, rightBoxes, pairCollector, reporter);

    LOG.debug("Filtered result size " + filterCount + ", refined result size " + emitted.get());

    return emitted.get();
}