Example usage for org.apache.hadoop.mapred MapFileOutputFormat getReaders

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.MapFileOutputFormat.getReaders.

Prototype

public static MapFile.Reader[] getReaders(FileSystem ignored, Path dir, Configuration conf) throws IOException 
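
Note that the FileSystem argument is unused; as its name in the Hadoop source (ignored) suggests, the readers are opened against the filesystem of the supplied Path.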

Document

Open the output generated by this format.
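
Before the examples, here is a minimal sketch of the typical read pattern: open the readers, look up a single key with getEntry, and close the readers. The output path, the key, and the Text value type are placeholders, and the sketch assumes the data was written by a job using MapFileOutputFormat with the default HashPartitioner.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;

public class MapFileLookup {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Directory previously written by a job using MapFileOutputFormat;
        // the path is a placeholder.
        Path dir = new Path("/user/example/output");
        MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, dir, conf);
        try {
            // getEntry routes the lookup to the one part file that can hold
            // the key, assuming the writing job used HashPartitioner.
            Text key = new Text("some-key");
            Text value = new Text();
            Writable found = MapFileOutputFormat.getEntry(readers, new HashPartitioner<Text, Text>(), key, value);
            System.out.println(found == null ? "not found" : key + "\t" + value);
        } finally {
            for (MapFile.Reader reader : readers) {
                reader.close();
            }
        }
    }
}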

Usage

From source file:com.peer2gear.nutch.xquery.XQueryParseFilter.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    Content content = null;
    if (args.length < 1) {
        usage();
        return;
    }
    String urlStr = args[0];
    String segment = null;
    if (args.length == 2) {
        segment = args[1];
    }
    if (segment != null) {
        Path file = new Path(segment, Content.DIR_NAME);
        FileSystem fs = FileSystem.get(conf);
        System.out.println("path: " + file.toString());
        Reader[] readers = MapFileOutputFormat.getReaders(fs, file, conf);
        content = new Content();
    for (Reader reader : readers) {
        // stop at the first reader that contains the url
        if (reader.get(new Text(urlStr), content) != null)
            break;
    }
        for (Reader reader : readers)
            reader.close();
    } else {
        content = createContent(conf, urlStr);
    }
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    String result = parse.getData().getMeta(XQueryParseFilter.METADATA_FIELD);
    System.out.println(result);
}
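
Because this code does not know which partition holds the url, it simply probes each reader in turn; the getEntry helper used in later examples instead computes the correct reader from a partitioner.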

From source file:org.apache.nutch.crawl.CrawlDbReader.java

License:Apache License

private void openReaders(String crawlDb, Configuration config) throws IOException {
    if (readers != null)
        return;
    FileSystem fs = FileSystem.get(config);
    readers = MapFileOutputFormat.getReaders(fs, new Path(crawlDb, CrawlDb.CURRENT_NAME), config);
}

From source file:org.apache.nutch.crawl.LinkDbReader.java

License:Apache License

public Inlinks getInlinks(Text url) throws IOException {

    if (readers == null) {
        synchronized (this) {
            readers = MapFileOutputFormat.getReaders(fs, new Path(directory, LinkDb.CURRENT_NAME), getConf());
        }
    }

    return (Inlinks) MapFileOutputFormat.getEntry(readers, PARTITIONER, url, new Inlinks());
}
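
MapFileOutputFormat.getEntry uses the supplied partitioner to pick the single part file that can contain the key, so it must match the partitioner used when the LinkDb was written; Nutch's PARTITIONER field here is a HashPartitioner, the MapReduce default.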

From source file:org.apache.nutch.scoring.webgraph.LoopReader.java

License:Apache License

/**
 * Prints the loopset for a single url. The loopset information will show any
 * outlink url that eventually forms a link cycle.
 *
 * @param webGraphDb The WebGraph to check for loops
 * @param url The url to check.
 * 
 * @throws IOException If an error occurs while printing loopset information.
 */
public void dumpUrl(Path webGraphDb, String url) throws IOException {

    // open the readers
    fs = FileSystem.get(getConf());
    loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb, Loops.LOOPS_DIR), getConf());

    // get the loopset for a given url, if any
    Text key = new Text(url);
    LoopSet loop = new LoopSet();
    MapFileOutputFormat.getEntry(loopReaders, new HashPartitioner<Text, LoopSet>(), key, loop);

    // print out each loop url in the set
    System.out.println(url + ":");
    for (String loopUrl : loop.getLoopSet()) {
        System.out.println("  " + loopUrl);
    }

    // close the readers
    FSUtils.closeReaders(loopReaders);
}

From source file:org.apache.nutch.scoring.webgraph.NodeReader.java

License:Apache License

/**
 * Prints the content of the Node represented by the url to system out.
 *
 * @param webGraphDb The webgraph from which to get the node.
 * @param url The url of the node.
 * 
 * @throws IOException If an error occurs while getting the node.
 */
public void dumpUrl(Path webGraphDb, String url) throws IOException {

    fs = FileSystem.get(getConf());
    nodeReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb, WebGraph.NODE_DIR), getConf());

    // get the node, print out the info, and close the readers
    Text key = new Text(url);
    Node node = new Node();
    MapFileOutputFormat.getEntry(nodeReaders, new HashPartitioner<Text, Node>(), key, node);
    System.out.println(url + ":");
    System.out.println("  inlink score: " + node.getInlinkScore());
    System.out.println("  outlink score: " + node.getOutlinkScore());
    System.out.println("  num inlinks: " + node.getNumInlinks());
    System.out.println("  num outlinks: " + node.getNumOutlinks());
    FSUtils.closeReaders(nodeReaders);
}

From source file:org.apache.nutch.segment.SegmentReader.java

License:Apache License

private List<Writable> getMapRecords(Path dir, Text key) throws Exception {
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, dir, getConf());
    ArrayList<Writable> res = new ArrayList<Writable>();
    Class keyClass = readers[0].getKeyClass();
    Class valueClass = readers[0].getValueClass();
    if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
        throw new IOException("Incompatible key (" + keyClass.getName() + ")");
    Writable value = (Writable) valueClass.newInstance();
    // we don't know the partitioning schema
    for (int i = 0; i < readers.length; i++) {
        if (readers[i].get(key, value) != null)
            res.add(value);
        readers[i].close();
    }
    return res;
}

From source file:org.apache.nutch.segment.SegmentReader.java

License:Apache License

public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(),
            new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    long cnt = 0L;
    Text key = new Text();
    for (int i = 0; i < readers.length; i++) {
        while (readers[i].next(key))
            cnt++;
        readers[i].close();
    }
    stats.generated = cnt;
    Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
    if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
        cnt = 0L;
        long start = Long.MAX_VALUE;
        long end = Long.MIN_VALUE;
        CrawlDatum value = new CrawlDatum();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, getConf());
        for (int i = 0; i < mreaders.length; i++) {
            while (mreaders[i].next(key, value)) {
                cnt++;
                if (value.getFetchTime() < start)
                    start = value.getFetchTime();
                if (value.getFetchTime() > end)
                    end = value.getFetchTime();
            }
            mreaders[i].close();
        }
        stats.start = start;
        stats.end = end;
        stats.fetched = cnt;
    }
    Path parseDir = new Path(segment, ParseData.DIR_NAME);
    if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
        cnt = 0L;
        long errors = 0L;
        ParseData value = new ParseData();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
        for (int i = 0; i < mreaders.length; i++) {
            while (mreaders[i].next(key, value)) {
                cnt++;
                if (!value.getStatus().isSuccess())
                    errors++;
            }
            mreaders[i].close();
        }
        stats.parsed = cnt;
        stats.parseErrors = errors;
    }
}
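
Note the mix of formats in this method: the crawl_generate data is stored as plain SequenceFiles and is therefore read with SequenceFileOutputFormat.getReaders, while the crawl_fetch and parse_data directories are MapFiles and are opened with MapFileOutputFormat.getReaders.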

From source file:org.apache.nutch.segment.TestSegmentMerger.java

License:Apache License

public void testLargeMerge() throws Exception {
    SegmentMerger merger = new SegmentMerger(conf);
    merger.merge(out, new Path[] { seg1, seg2 }, false, false, -1);
    // verify output
    FileStatus[] stats = fs.listStatus(out);
    // there should be just one path
    assertEquals(1, stats.length);
    Path outSeg = stats[0].getPath();
    Text k = new Text();
    ParseText v = new ParseText();
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(outSeg, ParseText.DIR_NAME), conf);
    int cnt1 = 0, cnt2 = 0;
    for (MapFile.Reader r : readers) {
        while (r.next(k, v)) {
            String ks = k.toString();
            String vs = v.getText();
            if (ks.startsWith("seg1-")) {
                cnt1++;
                assertTrue(vs.startsWith("seg1 "));
            } else if (ks.startsWith("seg2-")) {
                cnt2++;
                assertTrue(vs.startsWith("seg2 "));
            }
        }
        r.close();
    }
    assertEquals(countSeg1, cnt1);
    assertEquals(countSeg2, cnt2);
}

From source file:org.archive.tnh.nutch.Segments.java

License:Apache License

public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException {
    this.parseTextReaders = MapFileOutputFormat.getReaders(fs, new Path(segmentDir, "parse_text"), conf);
}