List of usage examples for org.apache.hadoop.mapred SequenceFileOutputFormat getReaders
public static SequenceFile.Reader[] getReaders(Configuration conf, Path dir) throws IOException
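A minimal sketch of the call pattern the examples below share, assuming an existing SequenceFile output directory; the path and the Text/Text key-value types are placeholders, not part of the API:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

public class GetReadersExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // hypothetical output directory produced by an earlier MapReduce job
        Path dir = new Path("/tmp/job-output");
        // getReaders returns one SequenceFile.Reader per part file in the directory
        SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(conf, dir);
        Text key = new Text();
        Text value = new Text();
        for (SequenceFile.Reader reader : readers) {
            try {
                // next() fills the reusable key/value objects until the part file is exhausted
                while (reader.next(key, value)) {
                    System.out.println(key + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}

The examples that follow all use this shape: open readers over a directory, iterate each reader with next(), then close it.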
From source file:com.digitalpebble.behemoth.gate.GATECorpusGenerator.java
License:Apache License
private void generateXMLdocs(Path input, File dir, int[] count) throws IOException {
    Configuration conf = BehemothConfiguration.create();
    Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(), input);
    for (Reader current : cacheReaders) {
        // read the key + values in that file
        Text key = new Text();
        BehemothDocument inputDoc = new BehemothDocument();
        BufferedWriter writer = null;
        gate.Document gatedocument = null;
        while (current.next(key, inputDoc)) {
            count[0]++;
            // generate a GATE document then save it to XML
            try {
                // first put the text
                GATEProcessor gp = new GATEProcessor(new URL("http://dummy.com"));
                gp.setConfig(conf);
                gatedocument = gp.generateGATEDoc(inputDoc);
                // then save as XML
                File outputFile = new File(dir, count[0] + ".xml");
                if (!outputFile.exists())
                    outputFile.createNewFile();
                writer = new BufferedWriter(new FileWriter(outputFile));
                writer.write(gatedocument.toXml());
            } catch (Exception e) {
                LOG.error("Exception on doc [" + count[0] + "] " + key.toString(), e);
            } finally {
                if (writer != null)
                    writer.close();
                if (gatedocument != null)
                    Factory.deleteResource(gatedocument);
            }
        }
        current.close();
    }
}
From source file:com.digitalpebble.behemoth.util.ContentExtractor.java
License:Apache License
private void generateDocs(Path input, Path dir, int[] count) throws IOException, ArchiveException {
    DocumentFilter docFilter = DocumentFilter.getFilters(getConf());
    Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(), input);
    for (Reader current : cacheReaders) {
        // read the key + values in that file
        Text key = new Text();
        BehemothDocument inputDoc = new BehemothDocument();
        while (current.next(key, inputDoc)) {
            count[0]++;
            // filter the doc?
            if (!docFilter.keep(inputDoc))
                continue;
            if (dumpBinary && inputDoc.getContent() == null)
                continue;
            else if (!dumpBinary && inputDoc.getText() == null)
                continue;
            String fileName = Integer.toString(count[0]);
            String urldoc = inputDoc.getUrl();
            if (mode.equals(FileNamingMode.URL) && urldoc != null && urldoc.length() > 0) {
                fileName = URLEncoder.encode(urldoc, "UTF-8");
            } else if (mode.equals(FileNamingMode.UUID) && urldoc != null && urldoc.length() > 0) {
                fileName = UUID.nameUUIDFromBytes(urldoc.getBytes()).toString();
            } else {
                fileName = String.format("%09d", count[0]);
            }
            if (!dumpBinary)
                fileName += ".txt";
            byte[] contentBytes;
            if (dumpBinary)
                contentBytes = inputDoc.getContent();
            else
                contentBytes = inputDoc.getText().getBytes("UTF-8");
            // out.write(contentBytes, 0, contentBytes.length);
            addToArchive(fileName, contentBytes, dir);
            // add the mapping URL->filename in the index -> archive num
            index.writeBytes(urldoc + "\t" + fileName + "\t" + String.format("%06d", partNum) + "\n");
        }
        current.close();
    }
}
From source file:org.apache.nutch.crawl.CrawlDbReader.java
License:Apache License
public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics start: " + crawlDb);
    }
    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);
    job.setBoolean("db.reader.stats.sort", sort);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);
    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    // https://issues.apache.org/jira/browse/NUTCH-1029
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
    JobClient.runJob(job);
    // reading the result
    FileSystem fileSystem = FileSystem.get(config);
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);
    Text key = new Text();
    LongWritable value = new LongWritable();
    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            LongWritable val = stats.get(k);
            if (val == null) {
                val = new LongWritable();
                if (k.equals("scx"))
                    val.set(Long.MIN_VALUE);
                if (k.equals("scn"))
                    val.set(Long.MAX_VALUE);
                stats.put(k, val);
            }
            if (k.equals("scx")) {
                if (val.get() < value.get())
                    val.set(value.get());
            } else if (k.equals("scn")) {
                if (val.get() > value.get())
                    val.set(value.get());
            } else {
                val.set(val.get() + value.get());
            }
        }
        reader.close();
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("Statistics for CrawlDb: " + crawlDb);
        LongWritable totalCnt = stats.get("T");
        stats.remove("T");
        LOG.info("TOTAL urls:\t" + totalCnt.get());
        for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
            String k = entry.getKey();
            LongWritable val = entry.getValue();
            if (k.equals("scn")) {
                LOG.info("min score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("scx")) {
                LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("sct")) {
                LOG.info("avg score:\t" + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
            } else if (k.startsWith("status")) {
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2)
                    LOG.info(" " + st[2] + " :\t" + val);
                else
                    LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
            } else
                LOG.info(k + ":\t" + val);
        }
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics: done");
    }
}
From source file:org.apache.nutch.segment.SegmentReader.java
License:Apache License
private List<Writable> getSeqRecords(Path dir, Text key) throws Exception {
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(), dir);
    ArrayList<Writable> res = new ArrayList<Writable>();
    Class keyClass = readers[0].getKeyClass();
    Class valueClass = readers[0].getValueClass();
    if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
        throw new IOException("Incompatible key (" + keyClass.getName() + ")");
    Writable aKey = (Writable) keyClass.newInstance();
    Writable value = (Writable) valueClass.newInstance();
    for (int i = 0; i < readers.length; i++) {
        while (readers[i].next(aKey, value)) {
            if (aKey.equals(key)) {
                res.add(value);
                // allocate a fresh instance: next() reuses the value object, so storing
                // the same reference repeatedly would let later reads overwrite earlier matches
                value = (Writable) valueClass.newInstance();
            }
        }
        readers[i].close();
    }
    return res;
}
From source file:org.apache.nutch.segment.SegmentReader.java
License:Apache License
public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(),
            new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    long cnt = 0L;
    Text key = new Text();
    for (int i = 0; i < readers.length; i++) {
        while (readers[i].next(key))
            cnt++;
        readers[i].close();
    }
    stats.generated = cnt;
    Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
    if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
        cnt = 0L;
        long start = Long.MAX_VALUE;
        long end = Long.MIN_VALUE;
        CrawlDatum value = new CrawlDatum();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, getConf());
        for (int i = 0; i < mreaders.length; i++) {
            while (mreaders[i].next(key, value)) {
                cnt++;
                if (value.getFetchTime() < start)
                    start = value.getFetchTime();
                if (value.getFetchTime() > end)
                    end = value.getFetchTime();
            }
            mreaders[i].close();
        }
        stats.start = start;
        stats.end = end;
        stats.fetched = cnt;
    }
    Path parseDir = new Path(segment, ParseData.DIR_NAME);
    // check parseDir here (the original tested fetchDir again, an apparent copy-paste slip,
    // while the readers below are opened over parseDir)
    if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
        cnt = 0L;
        long errors = 0L;
        ParseData value = new ParseData();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
        for (int i = 0; i < mreaders.length; i++) {
            while (mreaders[i].next(key, value)) {
                cnt++;
                if (!value.getStatus().isSuccess())
                    errors++;
            }
            mreaders[i].close();
        }
        stats.parsed = cnt;
        stats.parseErrors = errors;
    }
}