Example usage for the org.apache.hadoop.io.ObjectWritable constructor ObjectWritable(Object)

Introduction

On this page you can find example usages of the org.apache.hadoop.io.ObjectWritable constructor ObjectWritable(Object instance).

Prototype

public ObjectWritable(Object instance)

Source Link

Usage

From source file:org.apache.nutch.indexer.Indexer.java

License:Apache License

/**
 * Combines every record collected for one key into a single index document and
 * emits it wrapped in an {@link ObjectWritable}.
 *
 * <p>Each element of {@code values} is cast to {@link ObjectWritable}; the wrapped
 * payload is dispatched by runtime type: {@code Inlinks}, {@code CrawlDatum}
 * (db, fetch or linked status), {@code ParseData}, {@code ParseText}, or
 * {@code FloatWritable} (kept as the pagerank). Payloads of any other type,
 * including {@code null}, are logged at warn level and skipped.
 *
 * <p>Nothing is emitted when:
 * <ul>
 *   <li>a {@code CrawlDatum} with {@code STATUS_LINKED} was seen (treated as a redirect);</li>
 *   <li>a required input is missing (for TREC collections the db datum is optional);</li>
 *   <li>the content metadata lacks the segment name or the signature;</li>
 *   <li>the indexing filters throw or yield no document;</li>
 *   <li>the scoring filters throw.</li>
 * </ul>
 *
 * @param key      record key; cast to {@code Text} when handed to the filters
 * @param values   iterator over {@link ObjectWritable} instances for this key
 * @param output   collector that receives {@code (key, ObjectWritable(doc))}
 * @param reporter not used by this method
 * @throws IOException      declared by the signature (NOTE(review): presumably from
 *                          {@code output.collect} - confirm)
 * @throws RuntimeException if a {@code CrawlDatum} has a status that is neither a db
 *                          status, a fetch status, nor {@code STATUS_LINKED}
 */
public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
        throws IOException {

    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    CrawlDatum redir = null;
    ParseData parseData = null;
    ParseText parseText = null;
    Float pagerank = null; // TODO MC
    while (values.hasNext()) {
        Object value = ((ObjectWritable) values.next()).get(); // unwrap

        if (value instanceof Inlinks) {
            inlinks = (Inlinks) value;
        } else if (value instanceof CrawlDatum) {
            CrawlDatum datum = (CrawlDatum) value;
            if (CrawlDatum.hasDbStatus(datum)) {
                dbDatum = datum;
            } else if (CrawlDatum.hasFetchStatus(datum)) {
                fetchDatum = datum;
            } else if (CrawlDatum.STATUS_LINKED == datum.getStatus()) {
                // redirected page
                redir = datum;
            } else {
                throw new RuntimeException("Unexpected status: " + datum.getStatus());
            }
        } else if (value instanceof ParseData) {
            parseData = (ParseData) value;
        } else if (value instanceof ParseText) {
            parseText = (ParseText) value;
        } else if (value instanceof FloatWritable) { // TODO MC
            pagerank = ((FloatWritable) value).get();
        } else if (LOG.isWarnEnabled()) {
            // A null payload matches no instanceof branch and ends up here;
            // guard the getClass() call so it is reported instead of throwing.
            LOG.warn("Unrecognized type: " + (value == null ? null : value.getClass()));
        }
    }

    if (collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
        // Each flag is true when the corresponding input is MISSING.
        LOG.info("index TREC: " + key.toString() + " " + (redir == null) + " " + (fetchDatum == null) + " "
                + (dbDatum == null) + " " + (parseText == null) + " " + (parseData == null) + " "
                + (inlinks == null) + " " + (pagerank == null));
    }

    if (redir != null) { // does not work - see http://www.mail-archive.com/nutch-commits@lucene.apache.org/msg01971.html
        // XXX page was redirected - what should we do?
        // XXX discard it for now

        LOG.info("index REDIR:" + redir); // sanity check
        return;
    }

    if (collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
        // TREC collections do not require a db datum.
        if (fetchDatum == null || parseText == null || parseData == null) {
            return; // only have inlinks
        }
    } else {
        if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) {
            return; // only have inlinks
        }
    }

    Document doc = new Document();
    Metadata metadata = parseData.getContentMeta();

    if (metadata.get(Nutch.SEGMENT_NAME_KEY) == null || metadata.get(Nutch.SIGNATURE_KEY) == null) {
        LOG.error("Metadata empty:" + key + " " + parseData.toString());
        return;
    }

    // add segment, used to map from merged index back to segment files
    doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY), Field.Store.YES, Field.Index.NO));

    // add digest, used by dedup
    doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY), Field.Store.YES, Field.Index.NO));

    Parse parse = new ParseImpl(parseText, parseData);
    try {
        // run indexing filters
        doc = this.filters.filter(doc, parse, (Text) key, fetchDatum, inlinks);
    } catch (IndexingException e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Error indexing " + key + ": " + e);
        }
        return;
    }

    if (doc == null) {
        // Defensive: doc is reassigned from the filter chain and dereferenced below.
        // If the chain yields no document there is nothing to score or emit.
        return;
    }

    float boost = 1.0f;
    // run scoring filters
    // After the checks above, dbDatum can only be null for TREC collections;
    // in that case scoring is skipped and the default boost is kept.
    if (dbDatum != null || !collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
        try {
            boost = this.scfilters.indexerScore((Text) key, doc, dbDatum, fetchDatum, parse, inlinks, boost);
        } catch (ScoringFilterException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Error calculating score " + key + ": " + e);
            }
            return;
        }
    }

    // apply boost to all indexed fields.
    //    doc.setBoost(boost); - it uses the default 1.0f. if set, all fields will have this value boosted
    // store boost for use by explain and dedup
    doc.add(new Field("boost", Float.toString(boost), Field.Store.YES, Field.Index.NO));
    doc.add(new Field("inlinks", (inlinks == null) ? "0" : Integer.toString(inlinks.size()), Field.Store.YES,
            Field.Index.NO));
    doc.add(new Field("outlinks",
            (parseData.getOutlinks() == null) ? "0" : Integer.toString(parseData.getOutlinks().length),
            Field.Store.YES, Field.Index.NO));
    doc.add(new Field("pagerank", (pagerank == null) ? "0" : Float.toString(pagerank), Field.Store.YES,
            Field.Index.NO));

    output.collect(key, new ObjectWritable(doc));
}

From source file:org.apache.nutch.indexer.Indexer.java

License:Apache License

/**
 * Wraps the incoming value in an {@link ObjectWritable} and emits it under the
 * unchanged key.
 *
 * @param key      key passed through to the collector as-is
 * @param value    value to wrap
 * @param output   collector that receives {@code (key, ObjectWritable(value))}
 * @param reporter not used by this method
 * @throws IOException declared by the signature (NOTE(review): presumably from
 *                     {@code output.collect} - confirm)
 */
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter)
        throws IOException {
    final ObjectWritable wrapped = new ObjectWritable(value);
    output.collect(key, wrapped);
}