Apache Nutch IndexSearcher — Lucene-backed implementation of the Searcher and HitDetailer interfaces.
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.searcher; import java.io.IOException; import java.io.File; import java.util.ArrayList; import java.util.Enumeration; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiReader; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.FieldCache; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.MapFieldSelector; import org.apache.lucene.search.PwaFunctionsWritable; import org.apache.lucene.search.caches.PwaCacheManager; import org.apache.hadoop.fs.*; import org.apache.hadoop.io.*; import org.apache.hadoop.conf.*; import org.apache.nutch.indexer.*; /** Implements {@link Searcher} and {@link HitDetailer} for either a single * merged index, or a set of indexes. 
*/ public class IndexSearcher implements Searcher, HitDetailer { private org.apache.lucene.search.Searcher luceneSearcher; private org.apache.lucene.index.IndexReader reader; private LuceneQueryOptimizer optimizer; private FileSystem fs; private Configuration conf; private QueryFilters queryFilters; private PwaCacheManager cache; /** Construct given a number of indexes. */ public IndexSearcher(Path[] indexDirs, Configuration conf, File blacklistFile) throws IOException { IndexReader[] readers = new IndexReader[indexDirs.length]; this.conf = conf; this.fs = FileSystem.get(conf); for (int i = 0; i < indexDirs.length; i++) { readers[i] = IndexReader.open(getDirectory(indexDirs[i])); } init(new MultiReader(readers), conf, blacklistFile); } /** Construct given a single merged index. */ public IndexSearcher(Path index, Configuration conf, File blacklistFile) throws IOException { this.conf = conf; this.fs = FileSystem.get(conf); init(IndexReader.open(getDirectory(index)), conf, blacklistFile); } private void init(IndexReader reader, Configuration conf, File blacklistFile) throws IOException { this.reader = reader; this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader); this.luceneSearcher.setSimilarity(new NutchSimilarity()); this.optimizer = new LuceneQueryOptimizer(conf); this.queryFilters = new QueryFilters(conf); // read all caches cache = PwaCacheManager.getInstance(reader, blacklistFile); } private Directory getDirectory(Path file) throws IOException { if ("local".equals(this.fs.getName())) { return FSDirectory.getDirectory(file.toString(), false); } else { return new FsDirectory(this.fs, file, false, this.conf); } } public Hits search(Query query, int numHits, String dedupField, String sortField, boolean reverse) throws IOException { org.apache.lucene.search.BooleanQuery luceneQuery = this.queryFilters.filter(query); return translateHits(optimizer.optimize(luceneQuery, luceneSearcher, numHits, sortField, reverse), dedupField, sortField); } /** * 
@param searcherMaxHits maximum number of matched documents * @param maxHitsPerDup ignore this value necessary because of interface */ public Hits search(Query query, int numHits, int searcherMaxHits, int maxHitsPerDup, String dedupField, String sortField, boolean reverse, PwaFunctionsWritable functions, int maxHitsPerVersion) throws IOException { org.apache.lucene.search.BooleanQuery luceneQuery = this.queryFilters.filter(query); luceneQuery.setFunctions(functions); // set functions and boosts return translateHits( optimizer.optimize(luceneQuery, luceneSearcher, numHits, searcherMaxHits, sortField, reverse), dedupField, sortField); } public String getExplanation(Query query, Hit hit) throws IOException { return luceneSearcher.explain(this.queryFilters.filter(query), hit.getIndexDocNo()).toHtml(); } public String getExplanation(Query query, Hit hit, PwaFunctionsWritable functions) throws IOException { org.apache.lucene.search.BooleanQuery luceneQuery = this.queryFilters.filter(query); luceneQuery.setFunctions(functions); // set functions and boosts return luceneSearcher.explain(luceneQuery, hit.getIndexDocNo()).toHtml(); } public HitDetails[] getDetails(PwaRequestDetailsWritable details) throws IOException { Hit[] hits = details.getHits(); String[] fields = details.getFields(); HitDetails[] results = new HitDetails[hits.length]; for (int i = 0; i < hits.length; i++) results[i] = getDetails(hits[i], fields); return results; } public HitDetails[] getDetails(Hit[] hits) throws IOException { HitDetails[] results = new HitDetails[hits.length]; for (int i = 0; i < hits.length; i++) results[i] = getDetails(hits[i]); return results; } /* BUG wayback 0000155 */ public HitDetails getDetails(Hit hit) throws IOException { return getDetails(hit, null); } public HitDetails getDetails(Hit hit, String[] fieldNames) throws IOException { ArrayList fields = new ArrayList(); ArrayList values = new ArrayList(); // see if fields are in cache first if (fieldNames != null) { 
ArrayList<String> remainingFields = new ArrayList<String>(); int cachedFieldsRead = 0; for (int i = 0; i < fieldNames.length; i++) { Object obj = cache.getValue(fieldNames[i], hit.getIndexDocNo()); if (obj != null) { fields.add(fieldNames[i]); values.add(obj.toString()); cachedFieldsRead++; } else { remainingFields.add(fieldNames[i]); } } if (fieldNames.length == cachedFieldsRead) { // if has all fields in cache return return new HitDetails((String[]) fields.toArray(new String[fields.size()]), (String[]) values.toArray(new String[values.size()])); } fieldNames = remainingFields.toArray(new String[remainingFields.size()]); // else read from index the remaining fields } //Document doc = luceneSearcher.doc(hit.getIndexDocNo(), new MapFieldSelector(sfields)); Document doc = reader.document(hit.getIndexDocNo(), (fieldNames == null) ? null : new MapFieldSelector(fieldNames)); Enumeration e = doc.fields(); while (e.hasMoreElements()) { Field field = (Field) e.nextElement(); fields.add(field.name()); values.add(field.stringValue()); } return new HitDetails((String[]) fields.toArray(new String[fields.size()]), (String[]) values.toArray(new String[values.size()])); } private Hits translateHits(TopDocs topDocs, String dedupField, String sortField) throws IOException { String[] dedupValues = null; if (dedupField != null) { dedupValues = FieldCache.DEFAULT.getStrings(reader, dedupField); } ScoreDoc[] scoreDocs = topDocs.scoreDocs; int length = scoreDocs.length; Hit[] hits = new Hit[length]; for (int i = 0; i < length; i++) { WritableComparable sortValue = new FloatWritable(scoreDocs[i].score); String dedupValue = (dedupValues == null) ? null : dedupValues[scoreDocs[i].doc]; hits[i] = new Hit(scoreDocs[i].doc, sortValue, dedupValue); } return new Hits(topDocs.totalHits, hits); } public void close() throws IOException { if (luceneSearcher != null) { luceneSearcher.close(); } if (reader != null) { reader.close(); } } }