org.archive.nutchwax.tools.DateAdder.java Source code

Introduction

Here is the source code for org.archive.nutchwax.tools.DateAdder.java
Source

/*
 * Copyright (C) 2008 Internet Archive.
 * 
 * This file is part of the archive-access tools project
 * (http://sourceforge.net/projects/archive-access).
 * 
 * The archive-access tools are free software; you can redistribute them and/or
 * modify them under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or any
 * later version.
 * 
 * The archive-access tools are distributed in the hope that they will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
 * Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser Public License along with
 * the archive-access tools; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.nutchwax.tools;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.apache.nutch.util.NutchConfiguration;

import org.archive.wayback.UrlCanonicalizer;

import org.archive.nutchwax.NutchWax;

/**
 * Reads a series of whitespace-separated date records, finds the
 * corresponding document in the key index, and writes a new index
 * whose documents carry the existing dates (taken from the source
 * indexes) plus the dates from the matching records.
 *
 * Each record line has at least three fields; the first two are
 * concatenated to form the lookup key and the third is the date.
 * Documents are looked up with the key
 * <code>canonicalized URL + digest</code>, so for a record to match,
 * its first field has to be the canonicalized URL and its second the
 * digest.
 * NOTE(review): earlier comments described the key as "digest+URL";
 * confirm the column order of the records file against its producer.
 */
public class DateAdder extends Configured implements Tool {

    /**
     * Runs the tool.
     *
     * @param args <code>&lt;key-index&gt; &lt;source1&gt; ... &lt;sourceN&gt;
     *             &lt;dest&gt; &lt;records&gt;</code>; a records argument of
     *             "-" reads the records from standard input
     * @return 0 on success, -1 if too few arguments were given
     * @throws Exception if reading the records or reading/writing an
     *                   index fails
     */
    public int run(String[] args) throws Exception {
        if (args.length < 4) {
            System.out.println("DateAdder <key-index> <source1> ... <sourceN> <dest> <records>");
            // Report the usage error through the return value instead of
            // calling System.exit(0) here: main() passes the value on to
            // System.exit(), and callers embedding the tool keep their JVM.
            return -1;
        }

        String mainIndexDir = args[0].trim();
        String destIndexDir = args[args.length - 2].trim();
        String recordsFile = args[args.length - 1].trim();

        Map<String, String> dateRecords = readDateRecords(recordsFile);

        // Instantiate the canonicalizer before touching any index so that a
        // configuration error is reported before the destination index is
        // created (and any existing one overwritten).
        UrlCanonicalizer canonicalizer = getCanonicalizer(this.getConf());

        IndexReader reader = null;
        IndexReader[] sourceReaders = new IndexReader[args.length - 3];
        IndexWriter writer = null;
        try {
            reader = IndexReader.open(mainIndexDir);
            for (int i = 0; i < sourceReaders.length; i++) {
                sourceReaders[i] = IndexReader.open(args[i + 1]);
            }
            writer = new IndexWriter(destIndexDir, new WhitespaceAnalyzer(), true);

            // NOTE(review): document numbers 0..numDocs()-1 are only all
            // valid when the key index has no deleted documents -- confirm
            // that the indexes handed to this tool never contain deletions.
            int docCount = reader.numDocs();
            for (int i = 0; i < docCount; i++) {
                Document oldDoc = reader.document(i);
                Document newDoc = new Document();

                // Copy the dates from all the source indexes to the new
                // document.
                Set<String> uniqueDates = collectSourceDates(sourceReaders, i);
                for (String date : uniqueDates) {
                    newDoc.add(dateField(date));
                }

                // Add the dates from the records, skipping any that the
                // document already has so that no date is stored twice.
                String newDates = lookupNewDates(canonicalizer, oldDoc, dateRecords);
                if (newDates != null) {
                    for (String date : newDates.split("\\s+")) {
                        if (uniqueDates.add(date)) {
                            newDoc.add(dateField(date));
                        }
                    }
                }

                // Finally, add the new document to the new index.
                writer.addDocument(newDoc);
            }
        } finally {
            closeAll(writer, reader, sourceReaders);
        }

        return 0;
    }

    /**
     * Reads the date records from the given file, or from standard input
     * if the name is "-".  Input is decoded as UTF-8.  Lines with fewer
     * than three whitespace-separated fields are reported on standard
     * output and skipped.
     *
     * @param recordsFile path of the records file, or "-" for stdin
     * @return map from key (first field + second field) to a
     *         " "-separated list of the dates seen for that key
     * @throws IOException if the file cannot be opened or read
     */
    private static Map<String, String> readDateRecords(String recordsFile) throws IOException {
        boolean fromStdin = "-".equals(recordsFile);
        InputStream recordsStream = fromStdin ? System.in : new FileInputStream(recordsFile);
        try {
            Map<String, String> dateRecords = new HashMap<String, String>();
            BufferedReader br = new BufferedReader(new InputStreamReader(recordsStream, "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                String[] fields = line.split("\\s+");
                if (fields.length < 3) {
                    System.out.println("Malformed line, not enough fields (" + fields.length + "): " + line);
                    continue;
                }

                // The key must be built in the same order as the lookup key
                // in lookupNewDates(): first field, then second field.
                String key = fields[0] + fields[1];
                String dates = dateRecords.get(key);
                dateRecords.put(key, dates == null ? fields[2] : dates + " " + fields[2]);
            }
            return dateRecords;
        } finally {
            // Only close what was opened here; System.in is left alone.
            if (!fromStdin) {
                recordsStream.close();
            }
        }
    }

    /**
     * Gathers the distinct date values stored for the given document
     * number across all source indexes.
     *
     * @param sourceReaders readers of the source indexes
     * @param docNumber     document number, the same in every index
     * @return mutable set of the distinct date values found
     * @throws IOException if a document cannot be read
     */
    private static Set<String> collectSourceDates(IndexReader[] sourceReaders, int docNumber) throws IOException {
        Set<String> uniqueDates = new HashSet<String>();
        for (IndexReader source : sourceReaders) {
            String[] dates = source.document(docNumber).getValues(NutchWax.DATE_KEY);
            // Older Lucene releases document getValues() as possibly
            // returning null when the document has no such field.
            if (dates != null) {
                Collections.addAll(uniqueDates, dates);
            }
        }
        return uniqueDates;
    }

    /**
     * Looks up the recorded dates for a document of the key index.
     *
     * @return the " "-separated dates recorded for the document, or
     *         <code>null</code> if there are none or the document's URL
     *         could not be canonicalized
     */
    private static String lookupNewDates(UrlCanonicalizer canonicalizer, Document oldDoc,
            Map<String, String> dateRecords) {
        try {
            // First, apply URL canonicalization from Wayback.
            String canonicalizedUrl = canonicalizer.urlStringToKey(oldDoc.get(NutchWax.URL_KEY));

            // The lookup key is canonicalized URL followed by digest.
            String key = canonicalizedUrl + oldDoc.get(NutchWax.DIGEST_KEY);

            return dateRecords.get(key);
        } catch (Exception e) {
            // The canonicalizer can throw various types of exceptions
            // due to malformed URIs.  Deliberately best-effort: warn and
            // carry on without new dates for this document.
            System.err.println("WARN: Not adding dates on malformed URI: " + oldDoc.get(NutchWax.URL_KEY));
            return null;
        }
    }

    /**
     * Creates a stored, untokenized date field holding the given value.
     */
    private static Field dateField(String date) {
        return new Field(NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED);
    }

    /**
     * Closes the writer and every reader that was actually opened.  Each
     * close is attempted even if an earlier one throws.
     * Entries of <code>sourceReaders</code> may be <code>null</code>.
     */
    private static void closeAll(IndexWriter writer, IndexReader reader, IndexReader[] sourceReaders)
            throws IOException {
        try {
            if (writer != null) {
                writer.close();
            }
        } finally {
            try {
                if (reader != null) {
                    reader.close();
                }
            } finally {
                closeSourceReaders(sourceReaders, 0);
            }
        }
    }

    /**
     * Closes the source readers from index <code>from</code> onwards,
     * continuing with the remaining readers if one close throws.
     */
    private static void closeSourceReaders(IndexReader[] sourceReaders, int from) throws IOException {
        for (int i = from; i < sourceReaders.length; i++) {
            if (sourceReaders[i] == null) {
                continue;
            }
            try {
                sourceReaders[i].close();
            } finally {
                closeSourceReaders(sourceReaders, i + 1);
            }
            return;
        }
    }

    /**
     * Utility function to instantiate a UrlCanonicalizer based on an
     * implementation specified in the configuration.
     *
     * @param conf configuration holding the property
     *             <code>nutchwax.urlfilter.wayback.canonicalizer</code>,
     *             the class name of the canonicalizer to use
     * @return a new instance of the configured class
     * @throws RuntimeException if the property is missing or blank, or
     *                          the class cannot be instantiated
     */
    public static UrlCanonicalizer getCanonicalizer(Configuration conf) {
        // Which Wayback canonicalizer to use: Aggressive, Identity, etc.
        String canonicalizerClassName = conf.get("nutchwax.urlfilter.wayback.canonicalizer");

        if (canonicalizerClassName == null || canonicalizerClassName.trim().length() == 0) {
            throw new RuntimeException("Missing value for property: nutchwax.urlfilter.wayback.canonicalizer");
        }

        try {
            UrlCanonicalizer canonicalizer = (UrlCanonicalizer) Class.forName(canonicalizerClassName).newInstance();

            return canonicalizer;
        } catch (Exception e) {
            // If we can't instantiate it, there's not much else we can do
            // other than just throw the Exception.
            throw new RuntimeException(e);
        }
    }

    /**
     * Command-line driver.  Runs the DateAdder through Hadoop's
     * ToolRunner and exits with the value returned by run().
     */
    public static void main(String args[]) throws Exception {
        int result = ToolRunner.run(NutchConfiguration.create(), new DateAdder(), args);

        System.exit(result);
    }

}