Usage examples for org.apache.hadoop.mapred.MapFileOutputFormat.getReaders
public static MapFile.Reader[] getReaders(FileSystem ignored, Path dir, Configuration conf) throws IOException
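The signature above is from the old mapred API: the FileSystem argument is ignored (hence its name), and the method returns one MapFile.Reader per part file found under dir. Before the examples from real projects below, here is a minimal sketch of the typical call pattern; the output directory /data/mapfile-output and the Text/IntWritable key/value schema are hypothetical, not taken from any of the sources.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;

public class GetReadersSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // One reader per part-NNNNN MapFile written by the job.
        MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path("/data/mapfile-output"), conf);
        try {
            IntWritable value = new IntWritable();
            // getEntry uses the partitioner to pick the single reader that can hold the key,
            // so the partitioner must match the one used when the output was written.
            MapFileOutputFormat.getEntry(readers, new HashPartitioner<Text, IntWritable>(), new Text("some-key"), value);
            System.out.println(value);
        } finally {
            for (MapFile.Reader reader : readers) {
                reader.close();
            }
        }
    }
}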
From source file:com.peer2gear.nutch.xquery.XQueryParseFilter.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    Content content = null;
    if (args.length < 1) {
        usage();
        return;
    }
    String urlStr = args[0];
    String segment = null;
    if (args.length == 2) {
        segment = args[1];
    }
    if (segment != null) {
        Path file = new Path(segment, Content.DIR_NAME);
        FileSystem fs = FileSystem.get(conf);
        System.out.println("path: " + file.toString());
        Reader[] readers = MapFileOutputFormat.getReaders(fs, file, conf);
        content = new Content();
        // The partitioning scheme is unknown here, so probe every reader
        // and stop at the first one that holds the key.
        for (Reader reader : readers) {
            if (reader.get(new Text(urlStr), content) != null)
                break;
        }
        for (Reader reader : readers)
            reader.close();
    } else {
        content = createContent(conf, urlStr);
    }
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    String result = parse.getData().getMeta(XQueryParseFilter.METADATA_FIELD);
    System.out.println(result);
}
From source file:org.apache.nutch.crawl.CrawlDbReader.java
License:Apache License
private void openReaders(String crawlDb, Configuration config) throws IOException {
    if (readers != null)
        return;
    FileSystem fs = FileSystem.get(config);
    readers = MapFileOutputFormat.getReaders(fs, new Path(crawlDb, CrawlDb.CURRENT_NAME), config);
}
From source file:org.apache.nutch.crawl.LinkDbReader.java
License:Apache License
public Inlinks getInlinks(Text url) throws IOException {
    if (readers == null) {
        synchronized (this) {
            // re-check under the lock so concurrent callers don't initialize twice
            if (readers == null) {
                readers = MapFileOutputFormat.getReaders(fs, new Path(directory, LinkDb.CURRENT_NAME), getConf());
            }
        }
    }
    return (Inlinks) MapFileOutputFormat.getEntry(readers, PARTITIONER, url, new Inlinks());
}
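Unlike the XQueryParseFilter example above, which probes every reader because the partitioning scheme is unknown, getEntry uses the supplied partitioner to compute which single part file can hold the key, so only one MapFile is consulted per lookup. This works only when the lookup partitioner matches the one used to write the output.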
From source file:org.apache.nutch.scoring.webgraph.LoopReader.java
License:Apache License
/**
 * Prints the loopset for a single url. The loopset information will show any
 * outlink url that eventually forms a link cycle.
 *
 * @param webGraphDb The WebGraph to check for loops
 * @param url The url to check.
 *
 * @throws IOException If an error occurs while printing loopset information.
 */
public void dumpUrl(Path webGraphDb, String url) throws IOException {
    // open the readers
    fs = FileSystem.get(getConf());
    loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb, Loops.LOOPS_DIR), getConf());
    // get the loopset for a given url, if any
    Text key = new Text(url);
    LoopSet loop = new LoopSet();
    MapFileOutputFormat.getEntry(loopReaders, new HashPartitioner<Text, LoopSet>(), key, loop);
    // print out each loop url in the set
    System.out.println(url + ":");
    for (String loopUrl : loop.getLoopSet()) {
        System.out.println("  " + loopUrl);
    }
    // close the readers
    FSUtils.closeReaders(loopReaders);
}
From source file:org.apache.nutch.scoring.webgraph.NodeReader.java
License:Apache License
/**
 * Prints the content of the Node represented by the url to system out.
 *
 * @param webGraphDb The webgraph from which to get the node.
 * @param url The url of the node.
 *
 * @throws IOException If an error occurs while getting the node.
 */
public void dumpUrl(Path webGraphDb, String url) throws IOException {
    // open the readers, get the node, print out the info, and close the readers
    fs = FileSystem.get(getConf());
    nodeReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb, WebGraph.NODE_DIR), getConf());
    Text key = new Text(url);
    Node node = new Node();
    MapFileOutputFormat.getEntry(nodeReaders, new HashPartitioner<Text, Node>(), key, node);
    System.out.println(url + ":");
    System.out.println("  inlink score: " + node.getInlinkScore());
    System.out.println("  outlink score: " + node.getOutlinkScore());
    System.out.println("  num inlinks: " + node.getNumInlinks());
    System.out.println("  num outlinks: " + node.getNumOutlinks());
    FSUtils.closeReaders(nodeReaders);
}
From source file:org.apache.nutch.segment.SegmentReader.java
License:Apache License
private List<Writable> getMapRecords(Path dir, Text key) throws Exception {
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, dir, getConf());
    ArrayList<Writable> res = new ArrayList<Writable>();
    Class<?> keyClass = readers[0].getKeyClass();
    Class<?> valueClass = readers[0].getValueClass();
    if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
        throw new IOException("Incompatible key (" + keyClass.getName() + ")");
    // we don't know the partitioning schema, so probe every reader; a fresh
    // value instance per reader keeps matches from overwriting each other
    for (int i = 0; i < readers.length; i++) {
        Writable value = (Writable) valueClass.newInstance();
        if (readers[i].get(key, value) != null)
            res.add(value);
        readers[i].close();
    }
    return res;
}
From source file:org.apache.nutch.segment.SegmentReader.java
License:Apache License
public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(),
            new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    long cnt = 0L;
    Text key = new Text();
    for (int i = 0; i < readers.length; i++) {
        while (readers[i].next(key))
            cnt++;
        readers[i].close();
    }
    stats.generated = cnt;
    Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
    if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
        cnt = 0L;
        long start = Long.MAX_VALUE;
        long end = Long.MIN_VALUE;
        CrawlDatum value = new CrawlDatum();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, getConf());
        for (int i = 0; i < mreaders.length; i++) {
            while (mreaders[i].next(key, value)) {
                cnt++;
                if (value.getFetchTime() < start)
                    start = value.getFetchTime();
                if (value.getFetchTime() > end)
                    end = value.getFetchTime();
            }
            mreaders[i].close();
        }
        stats.start = start;
        stats.end = end;
        stats.fetched = cnt;
    }
    Path parseDir = new Path(segment, ParseData.DIR_NAME);
    if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
        cnt = 0L;
        long errors = 0L;
        ParseData value = new ParseData();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
        for (int i = 0; i < mreaders.length; i++) {
            while (mreaders[i].next(key, value)) {
                cnt++;
                if (!value.getStatus().isSuccess())
                    errors++;
            }
            mreaders[i].close();
        }
        stats.parsed = cnt;
        stats.parseErrors = errors;
    }
}
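Note the two reader types in this example: the crawl_generate data is read with SequenceFileOutputFormat.getReaders because it is written as plain SequenceFiles, while the crawl_fetch and parse_data directories are MapFiles and go through MapFileOutputFormat.getReaders.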
From source file:org.apache.nutch.segment.TestSegmentMerger.java
License:Apache License
public void testLargeMerge() throws Exception {
    SegmentMerger merger = new SegmentMerger(conf);
    merger.merge(out, new Path[] { seg1, seg2 }, false, false, -1);
    // verify output
    FileStatus[] stats = fs.listStatus(out);
    // there should be just one path
    assertEquals(1, stats.length);
    Path outSeg = stats[0].getPath();
    Text k = new Text();
    ParseText v = new ParseText();
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(outSeg, ParseText.DIR_NAME), conf);
    int cnt1 = 0, cnt2 = 0;
    for (MapFile.Reader r : readers) {
        while (r.next(k, v)) {
            String ks = k.toString();
            String vs = v.getText();
            if (ks.startsWith("seg1-")) {
                cnt1++;
                assertTrue(vs.startsWith("seg1 "));
            } else if (ks.startsWith("seg2-")) {
                cnt2++;
                assertTrue(vs.startsWith("seg2 "));
            }
        }
        r.close();
    }
    assertEquals(countSeg1, cnt1);
    assertEquals(countSeg2, cnt2);
}
From source file:org.archive.tnh.nutch.Segments.java
License:Apache License
public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException {
    this.parseTextReaders = MapFileOutputFormat.getReaders(fs, new Path(segmentDir, "parse_text"), conf);
}