Example usage for org.apache.hadoop.mapred SequenceFileOutputFormat getReaders

Introduction

This page collects example usages of org.apache.hadoop.mapred.SequenceFileOutputFormat#getReaders, drawn from open source projects.

Prototype

public static SequenceFile.Reader[] getReaders(Configuration conf, Path dir) throws IOException 

Document

Open the output generated by this format.
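
A minimal, self-contained sketch of the call pattern is shown below; the output path and the Text/LongWritable key and value types are illustrative assumptions and must match whatever the job that produced the directory actually wrote. getReaders opens one SequenceFile.Reader per file in the output directory, and the caller is responsible for closing each reader.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

public class GetReadersSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Illustrative path: point this at a directory written by a SequenceFile job
        Path outputDir = new Path("/user/hadoop/job-output");

        // One reader is returned per part file in the directory
        SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(conf, outputDir);

        // Assumed key/value types; these must match what the job emitted
        Text key = new Text();
        LongWritable value = new LongWritable();
        for (SequenceFile.Reader reader : readers) {
            try {
                while (reader.next(key, value)) {
                    System.out.println(key + "\t" + value);
                }
            } finally {
                reader.close(); // callers must close every reader
            }
        }
    }
}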

Usage

From source file:com.digitalpebble.behemoth.gate.GATECorpusGenerator.java

License:Apache License

private void generateXMLdocs(Path input, File dir, int[] count) throws IOException {

    Configuration conf = BehemothConfiguration.create();
    Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(), input);
    for (Reader current : cacheReaders) {
        // read the key + values in that file
        Text key = new Text();
        BehemothDocument inputDoc = new BehemothDocument();
        BufferedWriter writer = null;
        gate.Document gatedocument = null;
        while (current.next(key, inputDoc)) {
            count[0]++;
            // generate a GATE document then save it to XML
            try {
                // first put the text
                GATEProcessor gp = new GATEProcessor(new URL("http://dummy.com"));
                gp.setConfig(conf);
                gatedocument = gp.generateGATEDoc(inputDoc);

                // then save as XML
                File outputFile = new File(dir, count[0] + ".xml");
                if (!outputFile.exists())
                    outputFile.createNewFile();

                writer = new BufferedWriter(new FileWriter(outputFile));
                writer.write(gatedocument.toXml());

            } catch (Exception e) {
                LOG.error("Exception on doc [" + count[0] + "] " + key.toString(), e);
            } finally {
                if (writer != null)
                    writer.close();
                if (gatedocument != null)
                    Factory.deleteResource(gatedocument);
            }
        }
        current.close();
    }
}

From source file:com.digitalpebble.behemoth.util.ContentExtractor.java

License:Apache License

private void generateDocs(Path input, Path dir, int[] count) throws IOException, ArchiveException {

    DocumentFilter docFilter = DocumentFilter.getFilters(getConf());

    Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(), input);
    for (Reader current : cacheReaders) {
        // read the key + values in that file
        Text key = new Text();
        BehemothDocument inputDoc = new BehemothDocument();
        while (current.next(key, inputDoc)) {
            count[0]++;
            // filter the doc?
            if (!docFilter.keep(inputDoc))
                continue;
            if (dumpBinary && inputDoc.getContent() == null)
                continue;
            else if (!dumpBinary && inputDoc.getText() == null)
                continue;

            String fileName = Integer.toString(count[0]);
            String urldoc = inputDoc.getUrl();
            if (mode.equals(FileNamingMode.URL) && urldoc != null && urldoc.length() > 0) {
                fileName = URLEncoder.encode(urldoc, "UTF-8");
            } else if (mode.equals(FileNamingMode.UUID) && urldoc != null && urldoc.length() > 0) {
                fileName = UUID.nameUUIDFromBytes(urldoc.getBytes()).toString();
            } else {
                fileName = String.format("%09d", count[0]);
            }

            if (!dumpBinary)
                fileName += ".txt";

            byte[] contentBytes;
            if (dumpBinary)
                contentBytes = inputDoc.getContent();
            else
                contentBytes = inputDoc.getText().getBytes("UTF-8");
            // out.write(contentBytes, 0, contentBytes.length);
            addToArchive(fileName, contentBytes, dir);

            // add the mapping URL->filename in the index -> archive num
            index.writeBytes(urldoc + "\t" + fileName + "\t" + String.format("%06d", partNum) + "\n");
        }
        current.close();
    }
}

From source file:org.apache.nutch.crawl.CrawlDbReader.java

License:Apache License

public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics start: " + crawlDb);
    }

    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);
    job.setBoolean("db.reader.stats.sort", sort);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);

    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // https://issues.apache.org/jira/browse/NUTCH-1029: suppress the empty
    // _SUCCESS marker file, which is not a SequenceFile and would otherwise
    // break the getReaders call below
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    JobClient.runJob(job);

    // reading the result
    FileSystem fileSystem = FileSystem.get(config);
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);

    Text key = new Text();
    LongWritable value = new LongWritable();

    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            LongWritable val = stats.get(k);
            if (val == null) {
                val = new LongWritable();
                if (k.equals("scx"))
                    val.set(Long.MIN_VALUE);
                if (k.equals("scn"))
                    val.set(Long.MAX_VALUE);
                stats.put(k, val);
            }
            if (k.equals("scx")) {
                if (val.get() < value.get())
                    val.set(value.get());
            } else if (k.equals("scn")) {
                if (val.get() > value.get())
                    val.set(value.get());
            } else {
                val.set(val.get() + value.get());
            }
        }
        reader.close();
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("Statistics for CrawlDb: " + crawlDb);
        LongWritable totalCnt = stats.get("T");
        stats.remove("T");
        LOG.info("TOTAL urls:\t" + totalCnt.get());
        for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
            String k = entry.getKey();
            LongWritable val = entry.getValue();
            if (k.equals("scn")) {
                LOG.info("min score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("scx")) {
                LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("sct")) {
                LOG.info("avg score:\t" + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
            } else if (k.startsWith("status")) {
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2)
                    LOG.info("   " + st[2] + " :\t" + val);
                else
                    LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
            } else
                LOG.info(k + ":\t" + val);
        }
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics: done");
    }

}

From source file:org.apache.nutch.segment.SegmentReader.java

License:Apache License

private List<Writable> getSeqRecords(Path dir, Text key) throws Exception {
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(), dir);
    ArrayList<Writable> res = new ArrayList<Writable>();
    Class keyClass = readers[0].getKeyClass();
    Class valueClass = readers[0].getValueClass();
    if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
        throw new IOException("Incompatible key (" + keyClass.getName() + ")");
    Writable aKey = (Writable) keyClass.newInstance();
    Writable value = (Writable) valueClass.newInstance();
    for (int i = 0; i < readers.length; i++) {
        while (readers[i].next(aKey, value)) {
            if (aKey.equals(key)) {
                res.add(value);
                // re-instantiate the value: otherwise every entry in res would
                // alias the same Writable, overwritten by the next call to next()
                value = (Writable) valueClass.newInstance();
            }
        }
        readers[i].close();
    }
    return res;
}

From source file:org.apache.nutch.segment.SegmentReader.java

License:Apache License

public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(),
            new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    long cnt = 0L;
    Text key = new Text();
    for (int i = 0; i < readers.length; i++) {
        while (readers[i].next(key))
            cnt++;
        readers[i].close();
    }
    stats.generated = cnt;
    Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
    if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
        cnt = 0L;
        long start = Long.MAX_VALUE;
        long end = Long.MIN_VALUE;
        CrawlDatum value = new CrawlDatum();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, getConf());
        for (int i = 0; i < mreaders.length; i++) {
            while (mreaders[i].next(key, value)) {
                cnt++;
                if (value.getFetchTime() < start)
                    start = value.getFetchTime();
                if (value.getFetchTime() > end)
                    end = value.getFetchTime();
            }
            mreaders[i].close();
        }
        stats.start = start;
        stats.end = end;
        stats.fetched = cnt;
    }
    Path parseDir = new Path(segment, ParseData.DIR_NAME);
    if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
        cnt = 0L;
        long errors = 0L;
        ParseData value = new ParseData();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
        for (int i = 0; i < mreaders.length; i++) {
            while (mreaders[i].next(key, value)) {
                cnt++;
                if (!value.getStatus().isSuccess())
                    errors++;
            }
            mreaders[i].close();
        }
        stats.parsed = cnt;
        stats.parseErrors = errors;
    }
}