org.archive.nutchwax.tools.LengthNormUpdater.java Source code

Introduction

Here is the source code for org.archive.nutchwax.tools.LengthNormUpdater.java
Source

package org.archive.nutchwax.tools;

/**
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Collection;
import java.util.HashSet;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import org.apache.nutch.indexer.NutchSimilarity;

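/**
 * Command-line tool that recomputes the norms of a Lucene index, scaling each
 * document's length norm by a boost derived from a page-rank value that is
 * looked up by the document's "url" field.
 *
 * <p>This is heavily cribbed from org.apache.lucene.misc.LengthNormModifier
 */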
public class LengthNormUpdater {
    /**
     * Help text printed for {@code -h} and on command-line usage errors.
     *
     * The formula line mirrors the computation in {@code reSetNorms}, which
     * multiplies the length norm by {@code log10(pagerank) + 1}.
     */
    private static final String USAGE = "Usage: LengthNormUpdater [OPTIONS] <pageranks> <index> [field1]...\n"
            + "\n"
            + "Update the norms of <index> with boosts based on values from <pageranks>\n"
            + "\n"
            + "Options:\n"
            + "\t-h                print this help and exit\n"
            + "\t-s <classname>    similarity implementation to use\n"
            + "\t-v                increase verbosity\n"
            + "\n"
            + "Reads the pagerank values from the <pageranks> file and calculates new\n"
            + "norms for the documents based on the formula:\n"
            + "\n"
            + "\tnorm = similarity.lengthNorm * (log10(pagerank) + 1)\n"
            + "\n"
            + "If fields are specified on the command-line, only they will be updated.\n"
            + "If a specified field does not have norms, an error message is given and\n"
            + "the program terminates without performing any updates.\n"
            + "\n"
            + "If no fields are given, all the fields in the index that have norms will\n"
            + "be updated.\n"
            + "\n"
            + "The default similarity implementation is NutchSimilarity\n"
            + "\n"
            + "Examples:\n"
            + "\n"
            + "\tLengthNormUpdater pagerank.txt index\n"
            + "\tLengthNormUpdater -v -v pagerank.txt index title content\n"
            + "\n";

    /**
     * Verbosity level, incremented once per {@code -v} option. Values above 0
     * enable progress messages; values above 1 additionally print one line per
     * updated document and field.
     */
    private static int VERBOSE = 0;

    /**
     * Command-line entry point: parses the options, opens the index and
     * rewrites the norms of the selected fields using the page ranks read
     * from the given file.
     *
     * Exits with status 1 on a usage error, an unknown option, a similarity
     * class that cannot be instantiated, or a requested field without norms.
     * Exits with status 0 after printing the help text for {@code -h}.
     *
     * @param args command-line arguments as described by the usage text
     * @throws IOException if an index operation fails with an I/O error
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.print(USAGE);
            System.exit(1);
        }

        Similarity s = new NutchSimilarity();

        int pos = 0;
        for (; (pos < args.length) && args[pos].startsWith("-"); pos++) {
            if ("-h".equals(args[pos])) {
                System.out.println(USAGE);
                System.exit(0);
            } else if ("-v".equals(args[pos])) {
                VERBOSE++;
            } else if ("-s".equals(args[pos])) {
                pos++;

                if (pos == args.length) {
                    System.err.println("Error: missing argument to option -s");
                    System.exit(1);
                }

                try {
                    Class<?> simClass = Class.forName(args[pos]);
                    s = (Similarity) simClass.newInstance();
                } catch (Exception e) {
                    System.err.println("Couldn't instantiate similarity with empty constructor: " + args[pos]);
                    e.printStackTrace(System.err);
                    System.exit(1);
                }
            } else {
                // Reject typos such as "-V" instead of silently ignoring them.
                System.err.println("Error: unknown option: " + args[pos]);
                System.err.print(USAGE);
                System.exit(1);
            }
        }

        // Both <pageranks> and <index> are mandatory positional arguments.
        if ((pos + 2) > args.length) {
            System.err.print(USAGE);
            System.exit(1);
        }

        String pagerankFile = args[pos++];

        IndexReader reader = IndexReader.open(args[pos++]);

        // System.exit() does not run finally blocks, so failures inside the
        // try are recorded here and acted upon only after the reader is closed.
        int exitStatus = 0;

        try {
            Set<String> fieldNames = new HashSet<String>();
            if (pos == args.length) {
                // No fields specified on command-line, get a list of all
                // fields in the index that have norms.
                @SuppressWarnings("unchecked") // same cast the original for-each performed
                Collection<String> allFields = (Collection<String>) reader
                        .getFieldNames(IndexReader.FieldOption.ALL);
                for (String fieldName : allFields) {
                    if (reader.hasNorms(fieldName)) {
                        fieldNames.add(fieldName);
                    }
                }
            } else {
                // Verify all explicitly specified fields have norms before
                // touching anything, so a bad field name causes no updates.
                for (int i = pos; i < args.length; i++) {
                    if (!reader.hasNorms(args[i])) {
                        System.err.println("Error: No norms for field: " + args[i]);
                        exitStatus = 1;
                        break;
                    }

                    fieldNames.add(args[i]);
                }
            }

            if (exitStatus == 0) {
                if (fieldNames.isEmpty()) {
                    System.out.println("Warning: No fields with norms to update");
                } else {
                    Map<String, Integer> ranks = getPageRanks(pagerankFile);

                    for (String fieldName : fieldNames) {
                        reSetNorms(reader, fieldName, ranks, s);
                    }
                }
            }
        } finally {
            reader.close();
        }

        if (exitStatus != 0) {
            System.exit(exitStatus);
        }
    }

    /**
     * Recomputes the norm of {@code fieldName} for every live document whose
     * "url" field has a positive entry in {@code ranks}.
     *
     * The new norm is {@code sim.lengthNorm(fieldName, termCount) * (log10(rank) + 1)},
     * encoded with {@code sim.encodeNorm}. Documents that are deleted, have no
     * "url" value, have no entry in {@code ranks}, or have a rank below 1 are
     * left untouched.
     *
     * @param reader    index reader used both to read terms/documents and to set norms
     * @param fieldName name of the field whose norms are rewritten
     * @param ranks     page rank keyed by URL
     * @param sim       similarity used to compute and encode the length norm
     * @throws IOException if the reader reports an I/O error
     */
    public static void reSetNorms(IndexReader reader, String fieldName, Map<String, Integer> ranks, Similarity sim)
            throws IOException {
        if (VERBOSE > 0) {
            System.out.println("Updating field: " + fieldName);
        }

        int[] termCounts = countFieldTerms(reader, fieldName);

        for (int d = 0; d < termCounts.length; d++) {
            if (reader.isDeleted(d)) {
                continue;
            }

            Document doc = reader.document(d);
            String url = doc.get("url");
            if (url == null) {
                continue;
            }

            Integer rank = ranks.get(url);
            // log10 is -Infinity for 0 and NaN for negative values, either of
            // which would produce a meaningless norm; treat such ranks like a
            // missing rank and keep the norm already stored in the index.
            if (rank == null || rank.intValue() < 1) {
                continue;
            }

            float originalNorm = sim.lengthNorm(fieldName, termCounts[d]);
            byte encodedOrig = sim.encodeNorm(originalNorm);
            // The "+ 1" makes a rank of 1 a neutral boost (factor 1.0).
            float rankedNorm = originalNorm * (float) (Math.log10(rank.intValue()) + 1);
            byte encodedRank = sim.encodeNorm(rankedNorm);

            if (VERBOSE > 1) {
                System.out.println(fieldName + "\t" + d + "\t" + originalNorm + "\t" + encodedOrig + "\t"
                        + rankedNorm + "\t" + encodedRank);
            }

            reader.setNorm(d, fieldName, encodedRank);
        }
    }

    /**
     * Sums, per document number, the frequencies of all terms of {@code fieldName}.
     *
     * @return an array of length {@code reader.maxDoc()} indexed by document number
     */
    private static int[] countFieldTerms(IndexReader reader, String fieldName) throws IOException {
        int[] termCounts = new int[reader.maxDoc()];

        // Positions the enumeration at the first term of the field (if any).
        TermEnum termEnum = reader.terms(new Term(fieldName, ""));
        try {
            TermDocs termDocs = reader.termDocs();
            try {
                do {
                    Term term = termEnum.term();
                    if (term != null) {
                        // Terms are enumerated ordered by field, then text, so
                        // the first term of another field ends this field's run.
                        if (!term.field().equals(fieldName)) {
                            break;
                        }
                        termDocs.seek(term);
                        while (termDocs.next()) {
                            termCounts[termDocs.doc()] += termDocs.freq();
                        }
                    }
                } while (termEnum.next());
            } finally {
                termDocs.close();
            }
        } finally {
            termEnum.close();
        }

        return termCounts;
    }

    /**
     * Reads page-rank records from the UTF-8 text file {@code filename}.
     *
     * Each line is expected to hold an integer rank followed by a URL,
     * separated by whitespace; any further fields are ignored. Lines with
     * fewer than two fields, a non-integer rank, or a negative rank are
     * reported on standard error and skipped. If a URL occurs more than once,
     * the last valid record wins.
     *
     * @param filename path of the page-rank file
     * @return map from URL to rank for every valid record
     * @throws RuntimeException wrapping the {@link IOException} if the file
     *         cannot be opened or read
     */
    public static Map<String, Integer> getPageRanks(String filename) {
        if (VERBOSE > 0) {
            System.out.println("Reading pageranks from: " + filename);
        }

        Map<String, Integer> pageranks = new HashMap<String, Integer>();

        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));

            String line;
            while ((line = reader.readLine()) != null) {
                // Trim first: split() would otherwise yield an empty leading
                // field for indented lines and the rank would fail to parse.
                String[] fields = line.trim().split("\\s+");

                if (fields.length < 2) {
                    System.err.println("Malformed pagerank, not enough fields (" + fields.length + "): " + line);
                    continue;
                }

                try {
                    int rank = Integer.parseInt(fields[0]);
                    String url = fields[1];

                    if (rank < 0) {
                        System.err.println("Malformed pagerank, rank less than 0: " + line);
                        // Skip it like every other malformed record instead of
                        // storing a value the norm formula cannot handle.
                        continue;
                    }

                    pageranks.put(url, rank);
                } catch (NumberFormatException nfe) {
                    System.err.println("Malformed pagerank, rank not an integer: " + line);
                    continue;
                }
            }
        } catch (IOException e) {
            // The signature declares no checked exception, so surface the
            // failure unchecked with the original cause attached.
            throw new RuntimeException(e);
        } finally {
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a read-only stream fails.
            }
        }

        return pageranks;
    }

}