Example usage for the org.apache.hadoop.io.ObjectWritable constructor ObjectWritable(Object)

List of usage examples for the org.apache.hadoop.io.ObjectWritable constructor ObjectWritable(Object)

Introduction

On this page you can find example usage of the org.apache.hadoop.io.ObjectWritable constructor ObjectWritable(Object).

Prototype

public ObjectWritable(Object instance) 

Source Link

Usage

From source file: org.apache.nutch.indexer.Indexer.java

License:Apache License

/**
 * Combines all records collected for one key into a single {@code Document} and emits it
 * wrapped in an {@link ObjectWritable}.
 *
 * <p>Every incoming value is an {@code ObjectWritable} whose payload is expected to be one of
 * {@code Inlinks}, {@code CrawlDatum}, {@code ParseData}, {@code ParseText} or
 * {@code FloatWritable} (stored as the page rank). Any other payload is logged and skipped.
 *
 * <p>Nothing is emitted when:
 * <ul>
 *   <li>a {@code CrawlDatum} with status {@code STATUS_LINKED} (a redirect) was seen;</li>
 *   <li>the fetch datum, parse text or parse data is missing (for non-TREC collections the
 *       db datum is required as well);</li>
 *   <li>the content metadata lacks the segment name or the signature;</li>
 *   <li>an indexing filter or scoring filter throws.</li>
 * </ul>
 *
 * @param key      record key; it is cast to {@code Text} when passed to the filters
 * @param values   iterator over {@code ObjectWritable} instances belonging to {@code key}
 * @param output   collector that receives the {@code (key, ObjectWritable(doc))} pair
 * @param reporter not used by this method
 * @throws IOException      declared by the signature; presumably propagated from
 *                          {@code output.collect}
 * @throws RuntimeException if a {@code CrawlDatum} has neither a db status, a fetch status
 *                          nor {@code STATUS_LINKED}
 */
public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
        throws IOException {

    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    CrawlDatum redir = null;
    ParseData parseData = null;
    ParseText parseText = null;
    Float pagerank = null; // TODO MC
    while (values.hasNext()) {
        Object value = ((ObjectWritable) values.next()).get(); // unwrap

        if (value instanceof Inlinks) {
            inlinks = (Inlinks) value;
        } else if (value instanceof CrawlDatum) {

            CrawlDatum datum = (CrawlDatum) value;
            if (CrawlDatum.hasDbStatus(datum)) {
                dbDatum = datum;
            } else if (CrawlDatum.hasFetchStatus(datum)) {
                fetchDatum = datum;
            } else if (CrawlDatum.STATUS_LINKED == datum.getStatus()) {
                // redirected page
                redir = datum;
            } else {
                throw new RuntimeException("Unexpected status: " + datum.getStatus());
            }
        } else if (value instanceof ParseData) {
            parseData = (ParseData) value;
        } else if (value instanceof ParseText) {
            parseText = (ParseText) value;
        } else if (value instanceof FloatWritable) { // TODO MC
            pagerank = ((FloatWritable) value).get();
        } else if (LOG.isWarnEnabled()) {
            // A null payload fails every instanceof test above and lands here; guard the
            // getClass() call so that logging the problem cannot itself throw.
            LOG.warn("Unrecognized type: " + (value == null ? null : value.getClass()));
        }
    }

    if (collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
        // Diagnostic line: one boolean per input telling whether it is missing.
        LOG.info("index TREC: " + key.toString() + " " + (redir == null) + " " + (fetchDatum == null) + " "
                + (dbDatum == null) + " " + (parseText == null) + " " + (parseData == null) + " "
                + (inlinks == null) + " " + (pagerank == null));
    }

    if (redir != null) { // does not work - see http://www.mail-archive.com/nutch-commits@lucene.apache.org/msg01971.html
        // XXX page was redirected - what should we do?
        // XXX discard it for now

        LOG.info("index REDIR:" + redir); // sanity check
        return;
    }

    if (collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
        // TREC collections are allowed to proceed without a db datum.
        if (fetchDatum == null /*|| dbDatum == null*/
                || parseText == null || parseData == null) {
            return; // only have inlinks
        }
    } else {
        if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) {
            return; // only have inlinks
        }
    }

    Document doc = new Document();
    Metadata metadata = parseData.getContentMeta();

    if (metadata.get(Nutch.SEGMENT_NAME_KEY) == null || metadata.get(Nutch.SIGNATURE_KEY) == null) {
        LOG.error("Metadata empty:" + key + " " + parseData.toString());
        return;
    }

    // add segment, used to map from merged index back to segment files
    doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY), Field.Store.YES, Field.Index.NO));

    // add digest, used by dedup
    doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY), Field.Store.YES, Field.Index.NO));

    Parse parse = new ParseImpl(parseText, parseData);
    try {
        // run indexing filters
        doc = this.filters.filter(doc, parse, (Text) key, fetchDatum, inlinks);
    } catch (IndexingException e) {
        if (LOG.isWarnEnabled()) {
            // Pass the exception as the throwable so the stack trace is not lost.
            LOG.warn("Error indexing " + key + ": " + e, e);
        }
        return;
    }

    float boost = 1.0f;
    // run scoring filters
    // For non-TREC collections dbDatum was already checked to be non-null above; for TREC it
    // may be missing, in which case scoring is skipped and the boost stays at 1.0.
    if (dbDatum != null || !collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
        try {
            boost = this.scfilters.indexerScore((Text) key, doc, dbDatum, fetchDatum, parse, inlinks, boost);
        } catch (ScoringFilterException e) {
            if (LOG.isWarnEnabled()) {
                // Pass the exception as the throwable so the stack trace is not lost.
                LOG.warn("Error calculating score " + key + ": " + e, e);
            }
            return;
        }
    }

    // apply boost to all indexed fields.
    //    doc.setBoost(boost); - it uses the default 1.0f. if set, all fields will have this value boosted
    // store boost for use by explain and dedup
    doc.add(new Field("boost", Float.toString(boost), Field.Store.YES, Field.Index.NO));
    doc.add(new Field("inlinks", (inlinks == null) ? "0" : Integer.toString(inlinks.size()), Field.Store.YES,
            Field.Index.NO));
    doc.add(new Field("outlinks",
            (parseData.getOutlinks() == null) ? "0" : Integer.toString(parseData.getOutlinks().length),
            Field.Store.YES, Field.Index.NO));
    doc.add(new Field("pagerank", (pagerank == null) ? "0" : Float.toString(pagerank), Field.Store.YES,
            Field.Index.NO));

    output.collect(key, new ObjectWritable(doc));
}

From source file: org.apache.nutch.indexer.Indexer.java

License:Apache License

/**
 * Emits the input record with its value wrapped in an {@link ObjectWritable}; the key is
 * passed through unchanged.
 *
 * @param key      key of the input record, forwarded as-is
 * @param value    value of the input record, wrapped before being emitted
 * @param output   collector that receives the {@code (key, ObjectWritable(value))} pair
 * @param reporter not used by this method
 * @throws IOException declared by the signature; presumably propagated from
 *                     {@code output.collect}
 */
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter)
        throws IOException {
    // Only the value is boxed; the key goes out exactly as it came in.
    final ObjectWritable wrapped = new ObjectWritable(value);
    output.collect(key, wrapped);
}