Usage examples for org.apache.hadoop.mapred.MapFileOutputFormat.getReaders
public static MapFile.Reader[] getReaders(FileSystem ignored, Path dir, Configuration conf) throws IOException
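The signature above is from the old mapred API: the FileSystem argument is ignored (hence its name), and the method returns one MapFile.Reader per part file found under dir. Before the examples from real projects below, here is a minimal sketch of the typical call pattern; the output directory /data/mapfile-output and the Text/IntWritable key/value schema are hypothetical, not taken from any of the sources.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;

public class GetReadersSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // One reader per part-NNNNN MapFile written by the job.
        MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path("/data/mapfile-output"), conf);
        try {
            IntWritable value = new IntWritable();
            // getEntry uses the partitioner to pick the single reader that can hold the key,
            // so the partitioner must match the one used when the output was written.
            MapFileOutputFormat.getEntry(readers, new HashPartitioner<Text, IntWritable>(), new Text("some-key"), value);
            System.out.println(value);
        } finally {
            for (MapFile.Reader reader : readers) {
                reader.close();
            }
        }
    }
}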
From source file:com.peer2gear.nutch.xquery.XQueryParseFilter.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    Content content = null;
    if (args.length < 1) {
        usage();
        return;
    }
    String urlStr = args[0];
    String segment = null;
    if (args.length == 2) {
        segment = args[1];
    }
    if (segment != null) {
        Path file = new Path(segment, Content.DIR_NAME);
        FileSystem fs = FileSystem.get(conf);
        System.out.println("path: " + file.toString());
        Reader[] readers = MapFileOutputFormat.getReaders(fs, file, conf);
        content = new Content();
        // The partitioning scheme is unknown here, so probe every reader
        // and stop at the first one that holds the key.
        for (Reader reader : readers) {
            if (reader.get(new Text(urlStr), content) != null)
                break;
        }
        for (Reader reader : readers)
            reader.close();
    } else {
        content = createContent(conf, urlStr);
    }
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    String result = parse.getData().getMeta(XQueryParseFilter.METADATA_FIELD);
    System.out.println(result);
}
From source file:org.apache.nutch.crawl.CrawlDbReader.java
License:Apache License
private void openReaders(String crawlDb, Configuration config) throws IOException {
    if (readers != null)
        return;
    FileSystem fs = FileSystem.get(config);
    readers = MapFileOutputFormat.getReaders(fs, new Path(crawlDb, CrawlDb.CURRENT_NAME), config);
}
From source file:org.apache.nutch.crawl.LinkDbReader.java
License:Apache License
public Inlinks getInlinks(Text url) throws IOException {
    if (readers == null) {
        synchronized (this) {
            // re-check under the lock so concurrent callers don't initialize twice
            if (readers == null) {
                readers = MapFileOutputFormat.getReaders(fs, new Path(directory, LinkDb.CURRENT_NAME), getConf());
            }
        }
    }
    return (Inlinks) MapFileOutputFormat.getEntry(readers, PARTITIONER, url, new Inlinks());
}
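Unlike the XQueryParseFilter example above, which probes every reader because the partitioning scheme is unknown, getEntry uses the supplied partitioner to compute which single part file can hold the key, so only one MapFile is consulted per lookup. This works only when the lookup partitioner matches the one used to write the output.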
From source file:org.apache.nutch.scoring.webgraph.LoopReader.java
License:Apache License
/**
 * Prints the loopset for a single url. The loopset information will show any
 * outlink url that eventually forms a link cycle.
 *
 * @param webGraphDb The WebGraph to check for loops
 * @param url The url to check.
 *
 * @throws IOException If an error occurs while printing loopset information.
 */
public void dumpUrl(Path webGraphDb, String url) throws IOException {
    // open the readers
    fs = FileSystem.get(getConf());
    loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb, Loops.LOOPS_DIR), getConf());
    // get the loopset for a given url, if any
    Text key = new Text(url);
    LoopSet loop = new LoopSet();
    MapFileOutputFormat.getEntry(loopReaders, new HashPartitioner<Text, LoopSet>(), key, loop);
    // print out each loop url in the set
    System.out.println(url + ":");
    for (String loopUrl : loop.getLoopSet()) {
        System.out.println("  " + loopUrl);
    }
    // close the readers
    FSUtils.closeReaders(loopReaders);
}
From source file:org.apache.nutch.scoring.webgraph.NodeReader.java
License:Apache License
/**
 * Prints the content of the Node represented by the url to system out.
 *
 * @param webGraphDb The webgraph from which to get the node.
 * @param url The url of the node.
 *
 * @throws IOException If an error occurs while getting the node.
 */
public void dumpUrl(Path webGraphDb, String url) throws IOException {
    // open the readers, get the node, print out the info, and close the readers
    fs = FileSystem.get(getConf());
    nodeReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb, WebGraph.NODE_DIR), getConf());
    Text key = new Text(url);
    Node node = new Node();
    MapFileOutputFormat.getEntry(nodeReaders, new HashPartitioner<Text, Node>(), key, node);
    System.out.println(url + ":");
    System.out.println("  inlink score: " + node.getInlinkScore());
    System.out.println("  outlink score: " + node.getOutlinkScore());
    System.out.println("  num inlinks: " + node.getNumInlinks());
    System.out.println("  num outlinks: " + node.getNumOutlinks());
    FSUtils.closeReaders(nodeReaders);
}
From source file:org.apache.nutch.segment.SegmentReader.java
License:Apache License
private List<Writable> getMapRecords(Path dir, Text key) throws Exception {
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, dir, getConf());
    ArrayList<Writable> res = new ArrayList<Writable>();
    Class<?> keyClass = readers[0].getKeyClass();
    Class<?> valueClass = readers[0].getValueClass();
    if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
        throw new IOException("Incompatible key (" + keyClass.getName() + ")");
    // we don't know the partitioning schema, so probe every reader; a fresh
    // value instance per reader keeps matches from overwriting each other
    for (int i = 0; i < readers.length; i++) {
        Writable value = (Writable) valueClass.newInstance();
        if (readers[i].get(key, value) != null)
            res.add(value);
        readers[i].close();
    }
    return res;
}
From source file:org.apache.nutch.segment.SegmentReader.java
License:Apache License
public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(),
            new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    long cnt = 0L;
    Text key = new Text();
    for (int i = 0; i < readers.length; i++) {
        while (readers[i].next(key))
            cnt++;
        readers[i].close();
    }
    stats.generated = cnt;
    Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
    if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
        cnt = 0L;
        long start = Long.MAX_VALUE;
        long end = Long.MIN_VALUE;
        CrawlDatum value = new CrawlDatum();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, getConf());
        for (int i = 0; i < mreaders.length; i++) {
            while (mreaders[i].next(key, value)) {
                cnt++;
                if (value.getFetchTime() < start)
                    start = value.getFetchTime();
                if (value.getFetchTime() > end)
                    end = value.getFetchTime();
            }
            mreaders[i].close();
        }
        stats.start = start;
        stats.end = end;
        stats.fetched = cnt;
    }
    Path parseDir = new Path(segment, ParseData.DIR_NAME);
    if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
        cnt = 0L;
        long errors = 0L;
        ParseData value = new ParseData();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
        for (int i = 0; i < mreaders.length; i++) {
            while (mreaders[i].next(key, value)) {
                cnt++;
                if (!value.getStatus().isSuccess())
                    errors++;
            }
            mreaders[i].close();
        }
        stats.parsed = cnt;
        stats.parseErrors = errors;
    }
}
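Note the two reader types in this example: the crawl_generate data is read with SequenceFileOutputFormat.getReaders because it is written as plain SequenceFiles, while the crawl_fetch and parse_data directories are MapFiles and go through MapFileOutputFormat.getReaders.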
From source file:org.apache.nutch.segment.TestSegmentMerger.java
License:Apache License
public void testLargeMerge() throws Exception {
    SegmentMerger merger = new SegmentMerger(conf);
    merger.merge(out, new Path[] { seg1, seg2 }, false, false, -1);
    // verify output
    FileStatus[] stats = fs.listStatus(out);
    // there should be just one path
    assertEquals(1, stats.length);
    Path outSeg = stats[0].getPath();
    Text k = new Text();
    ParseText v = new ParseText();
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(outSeg, ParseText.DIR_NAME), conf);
    int cnt1 = 0, cnt2 = 0;
    for (MapFile.Reader r : readers) {
        while (r.next(k, v)) {
            String ks = k.toString();
            String vs = v.getText();
            if (ks.startsWith("seg1-")) {
                cnt1++;
                assertTrue(vs.startsWith("seg1 "));
            } else if (ks.startsWith("seg2-")) {
                cnt2++;
                assertTrue(vs.startsWith("seg2 "));
            }
        }
        r.close();
    }
    assertEquals(countSeg1, cnt1);
    assertEquals(countSeg2, cnt2);
}
From source file:org.archive.tnh.nutch.Segments.java
License:Apache License
public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException {
    this.parseTextReaders = MapFileOutputFormat.getReaders(fs, new Path(segmentDir, "parse_text"), conf);
}