List of usage examples for org.apache.hadoop.mapred SequenceFileOutputFormat getReaders
public static SequenceFile.Reader[] getReaders(Configuration conf, Path dir) throws IOException
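A minimal sketch of the call pattern the examples below share, assuming an existing SequenceFile output directory; the path and the Text/Text key-value types are placeholders, not part of the API:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

public class GetReadersExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // hypothetical output directory produced by an earlier MapReduce job
        Path dir = new Path("/tmp/job-output");
        // getReaders returns one SequenceFile.Reader per part file in the directory
        SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(conf, dir);
        Text key = new Text();
        Text value = new Text();
        for (SequenceFile.Reader reader : readers) {
            try {
                // next() fills the reusable key/value objects until the part file is exhausted
                while (reader.next(key, value)) {
                    System.out.println(key + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}

The examples that follow all use this shape: open readers over a directory, iterate each reader with next(), then close it.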
From source file:com.digitalpebble.behemoth.gate.GATECorpusGenerator.java
License:Apache License
private void generateXMLdocs(Path input, File dir, int[] count) throws IOException {
    Configuration conf = BehemothConfiguration.create();
    Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(), input);
    for (Reader current : cacheReaders) {
        // read the key + values in that file
        Text key = new Text();
        BehemothDocument inputDoc = new BehemothDocument();
        BufferedWriter writer = null;
        gate.Document gatedocument = null;
        while (current.next(key, inputDoc)) {
            count[0]++;
            // generate a GATE document then save it to XML
            try {
                // first put the text
                GATEProcessor gp = new GATEProcessor(new URL("http://dummy.com"));
                gp.setConfig(conf);
                gatedocument = gp.generateGATEDoc(inputDoc);
                // then save as XML
                File outputFile = new File(dir, count[0] + ".xml");
                if (!outputFile.exists())
                    outputFile.createNewFile();
                writer = new BufferedWriter(new FileWriter(outputFile));
                writer.write(gatedocument.toXml());
            } catch (Exception e) {
                LOG.error("Exception on doc [" + count[0] + "] " + key.toString(), e);
            } finally {
                if (writer != null)
                    writer.close();
                if (gatedocument != null)
                    Factory.deleteResource(gatedocument);
            }
        }
        current.close();
    }
}
From source file:com.digitalpebble.behemoth.util.ContentExtractor.java
License:Apache License
private void generateDocs(Path input, Path dir, int[] count) throws IOException, ArchiveException {
    DocumentFilter docFilter = DocumentFilter.getFilters(getConf());
    Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(), input);
    for (Reader current : cacheReaders) {
        // read the key + values in that file
        Text key = new Text();
        BehemothDocument inputDoc = new BehemothDocument();
        while (current.next(key, inputDoc)) {
            count[0]++;
            // filter the doc?
            if (!docFilter.keep(inputDoc))
                continue;
            if (dumpBinary && inputDoc.getContent() == null)
                continue;
            else if (!dumpBinary && inputDoc.getText() == null)
                continue;
            String fileName = Integer.toString(count[0]);
            String urldoc = inputDoc.getUrl();
            if (mode.equals(FileNamingMode.URL) && urldoc != null && urldoc.length() > 0) {
                fileName = URLEncoder.encode(urldoc, "UTF-8");
            } else if (mode.equals(FileNamingMode.UUID) && urldoc != null && urldoc.length() > 0) {
                fileName = UUID.nameUUIDFromBytes(urldoc.getBytes()).toString();
            } else {
                fileName = String.format("%09d", count[0]);
            }
            if (!dumpBinary)
                fileName += ".txt";
            byte[] contentBytes;
            if (dumpBinary)
                contentBytes = inputDoc.getContent();
            else
                contentBytes = inputDoc.getText().getBytes("UTF-8");
            // out.write(contentBytes, 0, contentBytes.length);
            addToArchive(fileName, contentBytes, dir);
            // add the mapping URL->filename in the index -> archive num
            index.writeBytes(urldoc + "\t" + fileName + "\t" + String.format("%06d", partNum) + "\n");
        }
        current.close();
    }
}
From source file:org.apache.nutch.crawl.CrawlDbReader.java
License:Apache License
public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics start: " + crawlDb);
    }
    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);
    job.setBoolean("db.reader.stats.sort", sort);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);
    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    // https://issues.apache.org/jira/browse/NUTCH-1029
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
    JobClient.runJob(job);
    // reading the result
    FileSystem fileSystem = FileSystem.get(config);
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);
    Text key = new Text();
    LongWritable value = new LongWritable();
    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            LongWritable val = stats.get(k);
            if (val == null) {
                val = new LongWritable();
                if (k.equals("scx"))
                    val.set(Long.MIN_VALUE);
                if (k.equals("scn"))
                    val.set(Long.MAX_VALUE);
                stats.put(k, val);
            }
            if (k.equals("scx")) {
                if (val.get() < value.get())
                    val.set(value.get());
            } else if (k.equals("scn")) {
                if (val.get() > value.get())
                    val.set(value.get());
            } else {
                val.set(val.get() + value.get());
            }
        }
        reader.close();
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("Statistics for CrawlDb: " + crawlDb);
        LongWritable totalCnt = stats.get("T");
        stats.remove("T");
        LOG.info("TOTAL urls:\t" + totalCnt.get());
        for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
            String k = entry.getKey();
            LongWritable val = entry.getValue();
            if (k.equals("scn")) {
                LOG.info("min score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("scx")) {
                LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("sct")) {
                LOG.info("avg score:\t" + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
            } else if (k.startsWith("status")) {
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2)
                    LOG.info(" " + st[2] + " :\t" + val);
                else
                    LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
            } else
                LOG.info(k + ":\t" + val);
        }
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics: done");
    }
}
From source file:org.apache.nutch.segment.SegmentReader.java
License:Apache License
private List<Writable> getSeqRecords(Path dir, Text key) throws Exception {
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(), dir);
    ArrayList<Writable> res = new ArrayList<Writable>();
    Class keyClass = readers[0].getKeyClass();
    Class valueClass = readers[0].getValueClass();
    if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
        throw new IOException("Incompatible key (" + keyClass.getName() + ")");
    Writable aKey = (Writable) keyClass.newInstance();
    Writable value = (Writable) valueClass.newInstance();
    for (int i = 0; i < readers.length; i++) {
        while (readers[i].next(aKey, value)) {
            if (aKey.equals(key)) {
                res.add(value);
                // allocate a fresh instance: next() reuses the value object, so storing
                // the same reference repeatedly would let later reads overwrite earlier matches
                value = (Writable) valueClass.newInstance();
            }
        }
        readers[i].close();
    }
    return res;
}
From source file:org.apache.nutch.segment.SegmentReader.java
License:Apache License
public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(),
            new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    long cnt = 0L;
    Text key = new Text();
    for (int i = 0; i < readers.length; i++) {
        while (readers[i].next(key))
            cnt++;
        readers[i].close();
    }
    stats.generated = cnt;
    Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
    if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
        cnt = 0L;
        long start = Long.MAX_VALUE;
        long end = Long.MIN_VALUE;
        CrawlDatum value = new CrawlDatum();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, getConf());
        for (int i = 0; i < mreaders.length; i++) {
            while (mreaders[i].next(key, value)) {
                cnt++;
                if (value.getFetchTime() < start)
                    start = value.getFetchTime();
                if (value.getFetchTime() > end)
                    end = value.getFetchTime();
            }
            mreaders[i].close();
        }
        stats.start = start;
        stats.end = end;
        stats.fetched = cnt;
    }
    Path parseDir = new Path(segment, ParseData.DIR_NAME);
    // check parseDir here (the original tested fetchDir again, an apparent copy-paste slip,
    // while the readers below are opened over parseDir)
    if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
        cnt = 0L;
        long errors = 0L;
        ParseData value = new ParseData();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
        for (int i = 0; i < mreaders.length; i++) {
            while (mreaders[i].next(key, value)) {
                cnt++;
                if (!value.getStatus().isSuccess())
                    errors++;
            }
            mreaders[i].close();
        }
        stats.parsed = cnt;
        stats.parseErrors = errors;
    }
}