List of usage examples for the org.apache.hadoop.io.ObjectWritable constructor ObjectWritable(Object instance)
public ObjectWritable(Object instance)
From source file:org.apache.nutch.indexer.Indexer.java
License:Apache License
public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { Inlinks inlinks = null;/* ww w . ja v a 2 s. c o m*/ CrawlDatum dbDatum = null; CrawlDatum fetchDatum = null; CrawlDatum redir = null; ParseData parseData = null; ParseText parseText = null; Float pagerank = null; // TODO MC while (values.hasNext()) { Object value = ((ObjectWritable) values.next()).get(); // unwrap if (value instanceof Inlinks) { inlinks = (Inlinks) value; } else if (value instanceof CrawlDatum) { CrawlDatum datum = (CrawlDatum) value; if (CrawlDatum.hasDbStatus(datum)) dbDatum = datum; else if (CrawlDatum.hasFetchStatus(datum)) fetchDatum = datum; else if (CrawlDatum.STATUS_LINKED == datum.getStatus()) // redirected page redir = datum; else throw new RuntimeException("Unexpected status: " + datum.getStatus()); } else if (value instanceof ParseData) { parseData = (ParseData) value; } else if (value instanceof ParseText) { parseText = (ParseText) value; } else if (value instanceof FloatWritable) { // TODO MC pagerank = ((FloatWritable) value).get(); } else if (LOG.isWarnEnabled()) { LOG.warn("Unrecognized type: " + value.getClass()); } } if (collectionType.equals(Global.COLLECTION_TYPE_TREC)) { LOG.info("index TREC: " + key.toString() + " " + (redir == null) + " " + (fetchDatum == null) + " " + (dbDatum == null) + " " + (parseText == null) + " " + (parseData == null) + " " + (inlinks == null) + " " + (pagerank == null)); } if (redir != null) { // does not work - see http://www.mail-archive.com/nutch-commits@lucene.apache.org/msg01971.html // XXX page was redirected - what should we do? 
// XXX discard it for now LOG.info("index REDIR:" + redir); // sanity check return; } if (collectionType.equals(Global.COLLECTION_TYPE_TREC)) { if (fetchDatum == null /*|| dbDatum == null*/ || parseText == null || parseData == null) { return; // only have inlinks } } else { if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) { return; // only have inlinks } } Document doc = new Document(); Metadata metadata = parseData.getContentMeta(); if (metadata.get(Nutch.SEGMENT_NAME_KEY) == null || metadata.get(Nutch.SIGNATURE_KEY) == null) { LOG.error("Metadata empty:" + key + " " + parseData.toString()); return; } // add segment, used to map from merged index back to segment files doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY), Field.Store.YES, Field.Index.NO)); // add digest, used by dedup doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY), Field.Store.YES, Field.Index.NO)); Parse parse = new ParseImpl(parseText, parseData); try { // run indexing filters doc = this.filters.filter(doc, parse, (Text) key, fetchDatum, inlinks); } catch (IndexingException e) { if (LOG.isWarnEnabled()) { LOG.warn("Error indexing " + key + ": " + e); } return; } float boost = 1.0f; // run scoring filters if (dbDatum != null || !collectionType.equals(Global.COLLECTION_TYPE_TREC)) { try { boost = this.scfilters.indexerScore((Text) key, doc, dbDatum, fetchDatum, parse, inlinks, boost); } catch (ScoringFilterException e) { if (LOG.isWarnEnabled()) { LOG.warn("Error calculating score " + key + ": " + e); } return; } } // apply boost to all indexed fields. // doc.setBoost(boost); - it uses the default 1.0f. if set, all fields will have this value boosted // store boost for use by explain and dedup doc.add(new Field("boost", Float.toString(boost), Field.Store.YES, Field.Index.NO)); doc.add(new Field("inlinks", (inlinks == null) ? 
"0" : Integer.toString(inlinks.size()), Field.Store.YES, Field.Index.NO)); doc.add(new Field("outlinks", (parseData.getOutlinks() == null) ? "0" : Integer.toString(parseData.getOutlinks().length), Field.Store.YES, Field.Index.NO)); doc.add(new Field("pagerank", (pagerank == null) ? "0" : Float.toString(pagerank), Field.Store.YES, Field.Index.NO)); output.collect(key, new ObjectWritable(doc)); }
From source file:org.apache.nutch.indexer.Indexer.java
License:Apache License
/**
 * Passes each record through unchanged except for wrapping the value in an
 * {@link ObjectWritable}, and emits it under the same key.
 *
 * @param key      record key, emitted as-is
 * @param value    record value to wrap
 * @param output   collector that receives {@code (key, ObjectWritable(value))}
 * @param reporter not used by this method
 * @throws IOException declared by the mapper contract
 */
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
    ObjectWritable wrapped = new ObjectWritable(value);
    output.collect(key, wrapped);
}