Example usage for org.apache.hadoop.mapreduce.lib.output MapFileOutputFormat getReaders

List of usage examples for org.apache.hadoop.mapreduce.lib.output MapFileOutputFormat getReaders

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce.lib.output MapFileOutputFormat getReaders.

Prototype

public static MapFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException 

Source Link

Document

Open the output generated by this format.

Usage

From source file:com.github.ygf.pagerank.InLinksTopNReducer.java

License:Apache License

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {

    Configuration conf = context.getConfiguration();
    Path titlesDir = new Path(conf.get("inlinks.titles_dir"));

    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    IntWritable page = new IntWritable();
    Text title = new Text();

    int[] inLinks = new int[topN.size()];
    String[] titles = new String[topN.size()];

    for (int i = inLinks.length - 1; i >= 0; i--) {
        Map.Entry<Integer, Integer> entry = topN.poll();
        page.set(entry.getValue());/*from  ww  w .  j av  a  2 s .c  om*/
        MapFileOutputFormat.getEntry(readers, partitioner, page, title);
        inLinks[i] = entry.getKey();
        titles[i] = title.toString();
    }

    for (MapFile.Reader reader : readers) {
        reader.close();
    }

    for (int i = 0; i < inLinks.length; i++) {
        context.write(new IntWritable(inLinks[i]), new Text(titles[i]));
    }
}

From source file:com.github.ygf.pagerank.PageRank.java

License:Apache License

private int getNumPages(Configuration conf, Path titlesDir) throws Exception {

    int numPages = 0;

    IntWritable pageNumber = new IntWritable();
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    for (int i = 0; i < readers.length; i++) {
        readers[i].finalKey(pageNumber);
        if (pageNumber.get() > numPages) {
            numPages = pageNumber.get();
        }// w w w  .  j  ava  2 s .c  om
    }
    for (MapFile.Reader reader : readers) {
        reader.close();
    }

    return numPages;
}

From source file:com.github.ygf.pagerank.PageRankIterationMapper.java

License:Apache License

@Override
public void map(ShortArrayWritable inKey, MatrixBlockWritable inValue, Context context)
        throws IOException, InterruptedException {

    // This task gets each block M_{i,j}, loads the corresponding stripe j
    // of the vector v_{k-1} and produces the partial result of the stripe i
    // of the vector v_k.

    Configuration conf = context.getConfiguration();
    int iter = Integer.parseInt(conf.get("pagerank.iteration"));
    int numPages = Integer.parseInt(conf.get("pagerank.num_pages"));
    short blockSize = Short.parseShort(conf.get("pagerank.block_size"));

    Writable[] blockIndexes = inKey.get();
    short i = ((ShortWritable) blockIndexes[0]).get();
    short j = ((ShortWritable) blockIndexes[1]).get();

    int vjSize = (j > numPages / blockSize) ? (numPages % blockSize) : blockSize;
    FloatWritable[] vj = new FloatWritable[vjSize];

    if (iter == 1) {
        // Initial PageRank vector with 1/n for all pages.
        for (int k = 0; k < vj.length; k++) {
            vj[k] = new FloatWritable(1.0f / numPages);
        }/*from www . ja  v a  2 s  . c om*/
    } else {
        // Load the stripe j of the vector v_{k-1} from the MapFiles.
        Path outputDir = MapFileOutputFormat.getOutputPath(context).getParent();
        Path vjDir = new Path(outputDir, "v" + (iter - 1));
        MapFile.Reader[] readers = MapFileOutputFormat.getReaders(vjDir, conf);
        Partitioner<ShortWritable, FloatArrayWritable> partitioner = new HashPartitioner<ShortWritable, FloatArrayWritable>();
        ShortWritable key = new ShortWritable(j);
        FloatArrayWritable value = new FloatArrayWritable();
        MapFileOutputFormat.getEntry(readers, partitioner, key, value);
        Writable[] writables = value.get();
        for (int k = 0; k < vj.length; k++) {
            vj[k] = (FloatWritable) writables[k];
        }
        for (MapFile.Reader reader : readers) {
            reader.close();
        }
    }

    // Initialize the partial result i of the vector v_k.
    int viSize = (i > numPages / blockSize) ? (numPages % blockSize) : blockSize;
    FloatWritable[] vi = new FloatWritable[viSize];
    for (int k = 0; k < vi.length; k++) {
        vi[k] = new FloatWritable(0);
    }

    // Multiply M_{i,j} by the stripe j of the vector v_{k-1} to obtain the
    // partial result i of the vector v_k.
    Writable[][] blockColumns = inValue.get();
    for (int k = 0; k < blockColumns.length; k++) {
        Writable[] blockColumn = blockColumns[k];
        if (blockColumn.length > 0) {
            int vDegree = ((ShortWritable) blockColumn[0]).get();
            for (int columnIndex = 1; columnIndex < blockColumn.length; columnIndex++) {
                int l = ((ShortWritable) blockColumn[columnIndex]).get();
                vi[l].set(vi[l].get() + (1.0f / vDegree) * vj[k].get());
            }
        }
    }

    context.write(new ShortWritable(i), new FloatArrayWritable(vi));
}

From source file:com.github.ygf.pagerank.PageRankTopNReducer.java

License:Apache License

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {

    Configuration conf = context.getConfiguration();
    Path titlesDir = new Path(conf.get("pagerank.titles_dir"));

    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    IntWritable page = new IntWritable();
    Text title = new Text();

    float[] pageRanks = new float[topN.size()];
    String[] titles = new String[topN.size()];

    // The order of the entries is reversed. The priority queue is in
    // non-decreasing order and we want the highest PageRank first.
    for (int i = pageRanks.length - 1; i >= 0; i--) {
        Map.Entry<Float, Integer> entry = topN.poll();
        // Get the title of the page from the title index.
        page.set(entry.getValue());//from   w w w . ja  v  a2 s .  com
        MapFileOutputFormat.getEntry(readers, partitioner, page, title);
        pageRanks[i] = entry.getKey();
        titles[i] = title.toString();
    }

    for (MapFile.Reader reader : readers) {
        reader.close();
    }

    for (int i = 0; i < pageRanks.length; i++) {
        context.write(new FloatWritable(pageRanks[i]), new Text(titles[i]));
    }
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            JobBuilder.printUsage(this, "<path> <key>");
            return -1;
        }/*  w w  w .j  a v a2  s .c om*/
        Path path = new Path(args[0]);
        IntWritable key = new IntWritable(Integer.parseInt(args[1]));

        Reader[] readers = /*[*/MapFileOutputFormat.getReaders(path, getConf())/*]*/;
        Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
        Text val = new Text();
        Writable entry = /*[*/MapFileOutputFormat.getEntry(readers, partitioner, key, val)/*]*/;
        if (entry == null) {
            System.err.println("Key not found: " + key);
            return -1;
        }
        NcdcRecordParser parser = new NcdcRecordParser();
        parser.parse(val.toString());
        System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
        return 0;
    }

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            JobBuilder.printUsage(this, "<path> <key>");
            return -1;
        }/*from  w  w  w.j  a  v  a2  s.c o m*/
        Path path = new Path(args[0]);
        IntWritable key = new IntWritable(Integer.parseInt(args[1]));

        Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
        Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
        Text val = new Text();

        // vv LookupRecordsByTemperature-ReaderFragment
        Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
        // ^^ LookupRecordsByTemperature-ReaderFragment
        Writable entry = reader.get(key, val);
        if (entry == null) {
            System.err.println("Key not found: " + key);
            return -1;
        }
        NcdcRecordParser parser = new NcdcRecordParser();
        IntWritable nextKey = new IntWritable();
        do {
            parser.parse(val.toString());
            System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
        } while (reader.next(nextKey, val) && key.equals(nextKey));
        return 0;
    }

From source file:org.apache.nutch.segment.TestSegmentMergerCrawlDatums.java

License:Apache License

/**
 * Checks the merged segment and removes the stuff again.
 * /*www .  java2 s.c o  m*/
 * @param the
 *          test directory
 * @param the
 *          merged segment
 * @return the final status
 */
protected byte checkMergedSegment(Path testDir, Path mergedSegment) throws Exception {
    // Get a MapFile reader for the <Text,CrawlDatum> pairs
    MapFile.Reader[] readers = MapFileOutputFormat
            .getReaders(new Path(mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf);

    Text key = new Text();
    CrawlDatum value = new CrawlDatum();
    byte finalStatus = 0x0;

    for (MapFile.Reader reader : readers) {
        while (reader.next(key, value)) {
            LOG.info("Reading status for: " + key.toString() + " > "
                    + CrawlDatum.getStatusName(value.getStatus()));

            // Only consider fetch status
            if (CrawlDatum.hasFetchStatus(value) && key.toString().equals("http://nutch.apache.org/")) {
                finalStatus = value.getStatus();
            }
        }

        // Close the reader again
        reader.close();
    }

    // Remove the test directory again
    fs.delete(testDir, true);

    LOG.info("Final fetch status for: http://nutch.apache.org/ > " + CrawlDatum.getStatusName(finalStatus));

    // Return the final status
    return finalStatus;
}