// Java tutorial
/** * Copyright 2009 Welocalize, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.globalsight.ling.lucene; /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001,2004 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact apache@apache.org. * * 5. 
Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ import java.io.File; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; /** * <code>HighFreqTerms</code> class extracts terms and their * frequencies out of an existing Lucene index. 
* * @version $Id: HighFreqTerms.java,v 1.2 2013/09/13 06:14:27 wayne Exp $ */ public class HighFreqTerms { // The top numTerms will be displayed public static final int numTerms = 100; public static void main(String[] args) throws Exception { IndexReader reader = null; if (args.length == 1) { SimpleFSDirectory fsd = new SimpleFSDirectory(new File(args[0])); reader = DirectoryReader.open(fsd); } else { usage(); System.exit(1); } TermInfoQueue tiq = new TermInfoQueue(numTerms); //TODO: IS field right? String field = IndexDocument.TEXT; Terms terms = reader.getTermVector(0, field); //TermEnum terms = reader.terms(); TermsEnum termsEnum = terms.iterator(null); BytesRef next = null; while ((next = termsEnum.next()) != null) { tiq.insertWithOverflow(new TermInfo(new Term(field, termsEnum.term()), termsEnum.docFreq())); } while (tiq.size() != 0) { TermInfo termInfo = (TermInfo) tiq.pop(); System.out.println(termInfo.term + " " + termInfo.docFreq); } reader.close(); } private static void usage() { System.out.println("\n\n" + "java org.apache.lucene.misc.HighFreqTerms <index dir>\n\n"); } } final class TermInfo { TermInfo(Term t, int df) { term = t; docFreq = df; } int docFreq; Term term; } final class TermInfoQueue extends PriorityQueue { TermInfoQueue(int size) { super(size); } protected final boolean lessThan(Object a, Object b) { TermInfo termInfoA = (TermInfo) a; TermInfo termInfoB = (TermInfo) b; return termInfoA.docFreq < termInfoB.docFreq; } }