Example usage for org.apache.lucene.index IndexReader document

Introduction

On this page you can find example usage for org.apache.lucene.index IndexReader document.

Prototype

public final Document document(int docID) throws IOException

Document

Returns the stored fields of the n-th Document in this index. Note that, for performance reasons, this method does not check whether the requested document has been deleted; callers typically guard with isDeleted(int), as the examples below do.
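
A minimal sketch of the typical access pattern, assuming a Lucene 3.x index and a java.io.File named indexDir (the "title" field is hypothetical):

IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
try {
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            continue; // document(i) itself does not check deletions
        }
        Document doc = reader.document(i);
        System.out.println(doc.get("title")); // "title" is a hypothetical stored field
    }
} finally {
    reader.close();
}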

Usage

From source file:com.sun.socialsite.business.impl.LuceneSearchManagerImpl.java

License:Open Source License

/**
 * @return false if the index entry was not updated because it
 * was already current; true otherwise.
 */
public boolean addToIndex(final Profile profile) throws IOException {

    boolean needNewEntry = true;

    String key = getKey(profile);
    String userId = profile.getUserId();
    String firstName = profile.getFirstName();
    String middleName = profile.getMiddleName();
    String lastName = profile.getLastName();
    String nickName = profile.getNickName();
    String primaryEmail = profile.getPrimaryEmail();
    String displayName = profile.getDisplayName();

    IndexReader reader = IndexReader.open(indexDir);
    TermDocs termDocs = reader.termDocs(new Term("key", key));
    while (termDocs.next()) {
        Document existingDoc = reader.document(termDocs.doc());
        if (areEqual("profile", existingDoc.get("class")) && areEqual(userId, existingDoc.get("userId"))
                && areEqual(firstName, existingDoc.get("firstName"))
                && areEqual(middleName, existingDoc.get("middleName"))
                && areEqual(lastName, existingDoc.get("lastName"))
                && areEqual(nickName, existingDoc.get("nickName"))
                && areEqual(primaryEmail, existingDoc.get("primaryEmail"))
                && areEqual(displayName, existingDoc.get("displayName"))) {
            needNewEntry = false;
        }
    }
    termDocs.close();
    reader.close();

    if (needNewEntry) {
        Document newDoc = new Document();
        newDoc.add(new Field("key", key, Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("class", "profile", Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("userId", userId, Field.Store.YES, Field.Index.UN_TOKENIZED));
        if (firstName != null)
            newDoc.add(new Field("firstName", firstName, Field.Store.YES, Field.Index.TOKENIZED));
        if (middleName != null)
            newDoc.add(new Field("middleName", middleName, Field.Store.YES, Field.Index.TOKENIZED));
        if (lastName != null)
            newDoc.add(new Field("lastName", lastName, Field.Store.YES, Field.Index.TOKENIZED));
        if (nickName != null)
            newDoc.add(new Field("nickName", nickName, Field.Store.YES, Field.Index.TOKENIZED));
        if (primaryEmail != null)
            newDoc.add(new Field("primaryEmail", primaryEmail, Field.Store.YES, Field.Index.UN_TOKENIZED));
        if (displayName != null)
            newDoc.add(new Field("displayName", displayName, Field.Store.YES, Field.Index.TOKENIZED));

        IndexWriter writer = null;
        try {
            writer = new IndexWriter(indexDir, analyzer, false);
            writer.deleteDocuments(new Term("key", key)); // Delete old entry, if present
            writer.addDocument(newDoc);
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (Exception e) {
                // ignore failures while closing the writer
            }
        }
    }

        log.trace(String.format(
                "Indexed profile[userId=%s,firstName=%s,lastName=%s,nickName=%s,primaryEmail=%s,displayName=%s]",
                userId, firstName, lastName, nickName, primaryEmail, displayName));
    }

    return needNewEntry;
}
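
On Lucene versions that provide it (2.1 and later), the explicit delete-then-add sequence above can be collapsed into one atomic call; a sketch using the same indexDir, analyzer, key and newDoc as the method above:

IndexWriter writer = new IndexWriter(indexDir, analyzer, false);
try {
    // updateDocument deletes every document matching the term, then adds
    // the new document, replacing the deleteDocuments/addDocument pair
    writer.updateDocument(new Term("key", key), newDoc);
} finally {
    writer.close();
}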

From source file:com.tamingtext.classifier.bayes.ExtractTrainingData.java

License:Apache License

/**
 * Extract training data from a Lucene index.
 * <p>
 * Iterates over the documents in the Lucene index; the values in the categoryFields are inspected, and if found to
 * contain any of the strings found in the category file, a training data item will be emitted, assigned to the
 * matching category and containing the terms found in the fields listed in textFields. Output is written to
 * the output directory with one file per category.
 * <p>
 * The category file contains one line per category; each line contains a number of whitespace-delimited strings.
 * The first string on each line is the category name, while subsequent strings are used to identify documents
 * that belong in that category.
 * <p>
 * For example, 'Technology Computers Macintosh' will cause documents that contain either 'Technology', 'Computers'
 * or 'Macintosh' in one of their categoryFields to be assigned to the 'Technology' category.
 * 
 * @param indexDir 
 *   directory of the Lucene index to extract from
 *   
 * @param categoryFile
 *   file containing category strings to extract
 *   
 * @param categoryFields
 *   list of fields to match against category data
 *   
 * @param textFields
 *   list of fields containing terms to extract
 *   
 * @param outputDir
 *   directory to write output to
 *   
 * @param useTermVectors
 *   if true, extract terms from the index's term vectors instead of the stored field text
 *   
 * @throws IOException
 */
public static void extractTraininingData(File indexDir, File categoryFile, Collection<String> categoryFields,
        Collection<String> textFields, File outputDir, boolean useTermVectors) throws IOException {

    log.info("Index dir: " + indexDir);
    log.info("Category file: " + categoryFile);
    log.info("Output dir: " + outputDir);
    log.info("Category fields: " + categoryFields.toString());
    log.info("Text fields: " + textFields.toString());
    log.info("Use Term Vectors?: " + useTermVectors);
    OpenObjectIntHashMap<String> categoryCounts = new OpenObjectIntHashMap<String>();
    Map<String, List<String>> categories = readCategoryFile(categoryFile);

    Directory dir = FSDirectory.open(indexDir);
    IndexReader reader = IndexReader.open(dir, true);
    int max = reader.maxDoc();

    StringBuilder buf = new StringBuilder();

    for (int i = 0; i < max; i++) {
        if (!reader.isDeleted(i)) {
            Document d = reader.document(i);
            String category = null;

            // determine whether any of the fields in this document contain a 
            // category in the category list
            fields: for (String field : categoryFields) {
                for (Field f : d.getFields(field)) {
                    if (f.isStored() && !f.isBinary()) {
                        String fieldValue = f.stringValue().toLowerCase();
                        for (String cat : categories.keySet()) {
                            List<String> cats = categories.get(cat);
                            for (String c : cats) {
                                if (fieldValue.contains(c)) {
                                    category = cat;
                                    break fields;
                                }
                            }
                        }
                    }
                }
            }

            if (category == null)
                continue;

            // append the terms from each of the textFields to the training data for this document.
            buf.setLength(0);
            for (String field : textFields) {
                if (useTermVectors) {
                    appendVectorTerms(buf, reader.getTermFreqVector(i, field));
                } else {
                    appendFieldText(buf, d.getField(field));
                }
            }
            getWriterForCategory(outputDir, category).printf("%s\t%s\n", category, buf.toString());
            categoryCounts.adjustOrPutValue(category, 1, 1);
        }
    }

    if (log.isInfoEnabled()) {
        StringBuilder b = new StringBuilder();
        b.append("\nCatagory document counts:\n");
        LinkedList<String> keyList = new LinkedList<String>();
        categoryCounts.keysSortedByValue(keyList);
        String key;
        while (!keyList.isEmpty()) {
            key = keyList.removeLast();
            b.append(categoryCounts.get(key)).append('\t').append(key).append('\n');
        }
        log.info(b.toString());
    }
}
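
The readCategoryFile helper is not shown here. A plausible sketch based on the format described in the javadoc above (an assumption, not the original implementation; match strings are lowercased because the field values are lowercased before the contains() check):

static Map<String, List<String>> readCategoryFile(File categoryFile) throws IOException {
    Map<String, List<String>> categories = new HashMap<String, List<String>>();
    BufferedReader in = new BufferedReader(new FileReader(categoryFile));
    try {
        String line;
        while ((line = in.readLine()) != null) {
            String[] tokens = line.trim().split("\\s+");
            if (tokens.length == 0 || tokens[0].isEmpty())
                continue;
            // the first token names the category; every token on the line
            // (including the name itself) identifies documents for that category
            List<String> matches = new ArrayList<String>();
            for (String token : tokens) {
                matches.add(token.toLowerCase());
            }
            categories.put(tokens[0], matches);
        }
    } finally {
        in.close();
    }
    return categories;
}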

From source file:com.tamingtext.classifier.mlt.MoreLikeThisQueryTest.java

License:Apache License

@Test
public void testMoreLikeThisQuery() throws Exception {
    //<start id="lucene.examples.mlt.setup"/>
    Directory directory = FSDirectory.open(new File(modelPath));

    IndexReader indexReader = IndexReader.open(directory); //<co id="mlt.indexsetup"/>
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);

    Analyzer analyzer //<co id="mlt.analyzersetup"/>
            = new EnglishAnalyzer(Version.LUCENE_36);

    if (nGramSize > 1) { //<co id="mlt.ngramsetup"/>
        analyzer = new ShingleAnalyzerWrapper(analyzer, nGramSize, nGramSize);
    }

    MoreLikeThis moreLikeThis = new MoreLikeThis(indexReader); //<co id="mlt.configure"/>
    moreLikeThis.setAnalyzer(analyzer);
    moreLikeThis.setFieldNames(new String[] { "content" });

    /*<calloutlist>
    <callout arearefs="mlt.indexsetup">Open Index</callout>
    <callout arearefs="mlt.analyzersetup">Setup Analyzer</callout>
    <callout arearefs="mlt.ngramsetup">Setup NGrams</callout>
    <callout arearefs="mlt.configure">Create <classname>MoreLikeThis</classname></callout>
    </calloutlist>*/
    //<end id="lucene.examples.mlt.setup"/>

    // for testing against the same corpus
    moreLikeThis.setMinTermFreq(1);
    moreLikeThis.setMinDocFreq(1);

    //<start id="lucene.examples.mlt.query"/>
    Reader reader = new FileReader(inputPath); //<co id="mlt.query"/>
    Query query = moreLikeThis.like(reader);

    TopDocs results = indexSearcher.search(query, maxResults); //<co id="mlt.search"/>

    HashMap<String, CategoryHits> categoryHash = new HashMap<String, CategoryHits>();

    for (ScoreDoc sd : results.scoreDocs) { //<co id="mlt.collect"/>
        Document d = indexReader.document(sd.doc);
        Fieldable f = d.getFieldable(categoryFieldName);
        String cat = f.stringValue();
        CategoryHits ch = categoryHash.get(cat);
        if (ch == null) {
            ch = new CategoryHits();
            ch.setLabel(cat);
            categoryHash.put(cat, ch);
        }
        ch.incrementScore(sd.score);
    }

    SortedSet<CategoryHits> sortedCats //<co id="mlt.rank"/>
            = new TreeSet<CategoryHits>(CategoryHits.byScoreComparator());
    sortedCats.addAll(categoryHash.values());

    for (CategoryHits c : sortedCats) { //<co id="mlt.display"/>
        System.out.println(c.getLabel() + "\t" + c.getScore());
    }
    /*<calloutlist>
    <callout arearefs="mlt.query">Create Query</callout>
    <callout arearefs="mlt.search">Perform Search</callout>
    <callout arearefs="mlt.collect">Collect Results</callout>
    <callout arearefs="mlt.rank">Rank Categories</callout>
    <callout arearefs="mlt.display">Display Categories</callout>
    </calloutlist>*/
    //<end id="lucene.examples.mlt.query"/>

}

From source file:com.tamingtext.tagging.LuceneCategoryExtractor.java

License:Apache License

/** Dump the values stored in the specified field for each document,
 *  one value per line.
 *  
 * @param indexDir the index to read.
 * @param field the name of the field.
 * @param maxDocs the maximum number of documents to process (note: not used in the body below).
 * @param out the print writer output will be written to.
 * @throws IOException
 */
public static void dumpDocumentFields(File indexDir, String field, long maxDocs, PrintWriter out)
        throws IOException {
    Directory dir = FSDirectory.open(indexDir);
    IndexReader reader = IndexReader.open(dir, true);
    int max = reader.maxDoc();
    for (int i = 0; i < max; i++) {
        if (!reader.isDeleted(i)) {
            Document d = reader.document(i);
            for (Field f : d.getFields(field)) {
                if (f.isStored() && !f.isBinary()) {
                    String value = f.stringValue();
                    if (value != null) {
                        out.printf("%s\n", value);
                    }
                }
            }
        }
    }
}
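
A hedged invocation sketch (the index path and field name are illustrative):

PrintWriter out = new PrintWriter(System.out);
dumpDocumentFields(new File("/path/to/index"), "category", Long.MAX_VALUE, out); // maxDocs is ignored by the body above
out.flush();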

From source file:com.tinkerpop.graph.benchmark.index.LuceneKeyToNodeIdIndexImpl.java

License:Apache License

@Override
public long getGraphNodeId(String udk) {
    Long result = hotCache.get(udk);
    if (result == null) {
        //fail fast on bloom
        int bloomKey = Math.abs(udk.hashCode() % bloomFilterSize);
        if (!bloomFilter.fastGet(bloomKey)) {
            //Not seen - fail
            bloomReadSaves++;
            return -1;
        }
        result = uncommittedKeyBuffer.get(udk);
        if (result != null) {
            return result;
        }
        if (reader == null) {
            try {
                reader = IndexReader.open(dir, true);
                subreaders = reader.getSequentialSubReaders();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
        try {
            Term searchTerm = term.createTerm(udk);
            for (IndexReader r : subreaders) {
                TermDocs td = r.termDocs(searchTerm);
                if (td.next()) {
                    Document doc = r.document(td.doc());
                    result = Long.parseLong(doc.get("id"));
                    hotCache.put(udk, result);
                    successfulLuceneReads++;
                    return result;
                }
            }
            failedLuceneReads++;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    } else {
        hotCacheHits++;
    }
    if (result == null) {
        return -1;
    } else {
        return result;
    }
}

From source file:com.zimbra.cs.index.RawIndexEditor.java

License:Open Source License

void dumpAll() throws IOException {
    IndexReader reader = IndexReader.open(luceneDirectory);
    try {
        int maxDoc = reader.maxDoc();
        System.out.println("There are " + maxDoc + " documents in this index.");

        for (int i = 0; i < maxDoc; i++) {
            dumpDocument(reader.document(i), reader.isDeleted(i));
        }
    } finally {
        reader.close();
    }
}

From source file:com.zimbra.cs.rmgmt.RemoteMailQueue.java

License:Open Source License

private void list0(SearchResult result, IndexReader indexReader, int offset, int limit) throws IOException {
    if (ZimbraLog.rmgmt.isDebugEnabled()) {
        ZimbraLog.rmgmt.debug("listing offset=" + offset + " limit=" + limit + " " + this);
    }
    int max = indexReader.maxDoc();

    int skip = 0;
    int listed = 0;

    for (int i = 0; i < max; i++) {
        if (indexReader.isDeleted(i)) {
            continue;
        }

        if (skip < offset) {
            skip++;
            continue;
        }

        Document doc = indexReader.document(i);
        Map<QueueAttr, String> qitem = docToQueueItem(doc);
        result.qitems.add(qitem);

        listed++;
        if (listed == limit) {
            break;
        }
    }
    result.hits = getNumMessages();
}

From source file:db.infiniti.config.HighFreqTerms.java

License:Apache License

private TermStats[] getLowerHigherEqualSpecificFreqTerms(IndexReader reader, int numTerms, String field,
        ArrayList<String> sentQueries, int specificFreq, boolean allranges) throws Exception {
    TermInfoLowerFreqThanX tiqLower = new TermInfoLowerFreqThanX(numTerms, specificFreq);
    TermInfoHigherFreqThanX tiqHigher = new TermInfoHigherFreqThanX(numTerms, specificFreq);
    TermInfoEqualFreqThanX tiqEqual = new TermInfoEqualFreqThanX(numTerms, specificFreq);
    if (field != null) {
        Document aDoc = reader.document(0); // note: fetched but not otherwise used
        TermEnum terms = reader.terms(new Term(field));

        do {
            if (terms != null && terms.term() != null) {
                String textOfTerm = terms.term().text();

                if (terms.term().field().equals(field)) {
                    if (!textEditor.isRefinedQueryStopWordLength(textOfTerm)
                            && !sentQueries.contains(textOfTerm)) {
                        tiqLower.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                        tiqHigher.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                        if (terms.docFreq() == specificFreq) {
                            tiqEqual.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                        }
                    }
                }
            }
        } while (terms.next());

    } else {
        TermEnum terms = reader.terms();
        while (terms.next()) {
            String textOfTerm = terms.term().text();
            if (!textEditor.isRefinedQueryStopWordLength(textOfTerm) && !sentQueries.contains(textOfTerm)) {
                tiqLower.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                tiqHigher.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                if (terms.docFreq() == specificFreq) {
                    tiqEqual.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                }
            }
        }
    }
    TermStats[] result;
    int count;
    // the priority queues pop lowest entries first, so each array segment is
    // populated from the end, working backwards, to get highest frequencies first
    if (allranges) {
        result = new TermStats[tiqLower.size() + tiqHigher.size() + tiqEqual.size()];
        count = tiqHigher.size() - 1 + tiqLower.size() + tiqEqual.size();
        while (tiqHigher.size() != 0) {
            result[count] = (TermStats) tiqHigher.pop();
            count--;
        }
        count = tiqLower.size() + tiqEqual.size() - 1;
        while (tiqEqual.size() != 0) {
            result[count] = (TermStats) tiqEqual.pop();
            count--;
        }
        count = tiqLower.size() - 1;
        while (tiqLower.size() != 0) {
            result[count] = (TermStats) tiqLower.pop();
            count--;
        }
    } else {
        result = new TermStats[tiqEqual.size()];
        count = tiqEqual.size() - 1;
        while (tiqEqual.size() != 0) {
            result[count] = (TermStats) tiqEqual.pop();
            count--;
        }
    }

    return result;
}

From source file:db.infiniti.config.HighFreqTerms.java

License:Apache License

/**
 * @param reader index reader to extract terms from
 * @param numTerms maximum number of terms to return
 * @param field field to restrict terms to, or null to scan all fields
 * @param sentQueries previously sent queries, excluded from the results
 * @param initialQuery initial query terms, excluded from the results
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field,
        ArrayList<String> sentQueries, List<String> initialQuery) throws Exception {

    TermInfoWiTFQueueForHighFreq tiq = new TermInfoWiTFQueueForHighFreq(numTerms);
    if (field != null) {
        Document aDoc = reader.document(0); // note: fetched but not otherwise used
        TermEnum terms = reader.terms(new Term(field));

        do {
            if (terms != null && terms.term() != null) {
                String textOfTerm = terms.term().text();

                if (terms.term().field().equals(field)) {
                    if (!textEditor.isRefinedQueryStopWordLength(textOfTerm)
                            && !sentQueries.contains(textOfTerm) && !initialQuery.contains(textOfTerm)) {
                        tiq.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                    }
                }
            }
        } while (terms.next());

    } else {
        TermEnum terms = reader.terms();
        while (terms.next()) {
            String textOfTerm = terms.term().text();
            if (!textEditor.isRefinedQueryStopWordLength(textOfTerm) && !sentQueries.contains(textOfTerm)
                    && !initialQuery.contains(textOfTerm)) {
                tiq.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
            }
        }
    }

    TermStats[] result = new TermStats[tiq.size()];

    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = (TermStats) tiq.pop();
        count--;
    }
    return result;
}
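
A hedged invocation sketch, assuming an open IndexReader and an instance of the class above (field name and term count are illustrative; empty lists disable the exclusion filters):

ArrayList<String> sentQueries = new ArrayList<String>();
List<String> initialQuery = new ArrayList<String>();
TermStats[] top = getHighFreqTerms(reader, 25, "content", sentQueries, initialQuery);
for (TermStats ts : top) {
    System.out.println(ts); // relies on TermStats.toString()
}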

From source file:de.dkt.eservices.elucene.indexmanagement.SearchFiles.java

License:Apache License

/**
 * Searches a query against the fields of an index and returns hitsToReturn documents.
 * @param index index where to search for the query text
 * @param sFields semicolon-separated list of document fields to match the query against
 * @param sAnalyzers semicolon-separated list of analyzer names, one per field
 * @param queryType type of query to build
 * @param queryString text of the input query
 * @param language language of the query
 * @param hitsToReturn number of documents to be returned
 * @return JSON object containing the results information and content
 * @throws ExternalServiceFailedException
 */
public static JSONObject search(String index, String sFields, String sAnalyzers, String queryType,
        String queryString, String language, int hitsToReturn) throws ExternalServiceFailedException {
    try {
        Date start = new Date();

        File f = FileFactory.generateFileInstance(indexDirectory + index);
        if (f == null || !f.exists()) {
            throw new ExternalServiceFailedException(
                    "Specified index [" + indexDirectory + index + "] does not exists.");
        }
        logger.info("Searching in folder: " + f.getAbsolutePath());
        Directory dir = FSDirectory.open(f);
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);

        Document doc = reader.document(0); // note: fetched but not otherwise used

        String[] fields = sFields.split(";");
        String[] analyzers = sAnalyzers.split(";");
        if (fields.length != analyzers.length) {
            logger.error("The number of fields and analyzers is different");
            throw new BadRequestException("The number of fields and analyzers is different");
        }

        //System.out.println("CHECK IF THE QUERY IS WORKING PROPERLY: "+queryString);
        Query query = OwnQueryParser.parseQuery(queryType, queryString, fields, analyzers, language);

        //System.out.println("\t QUERY: "+query);

        TopDocs results = searcher.search(query, hitsToReturn);

        Explanation exp = searcher.explain(query, 0); // explains how document 0 scores against the query; useful for debugging

        Date end = new Date();
        logger.info("Time: " + (end.getTime() - start.getTime()) + "ms");
        //         System.out.println("Time: "+(end.getTime()-start.getTime())+"ms");

        JSONObject resultModel = JSONLuceneResultConverter.convertResults(query, searcher, results);
        reader.close();
        return resultModel;
    } catch (IOException e) {
        e.printStackTrace();
        throw new ExternalServiceFailedException("IOException with message: " + e.getMessage());
    }
}
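
A hedged invocation sketch (index name, fields, analyzers and query type are illustrative values; sFields and sAnalyzers are semicolon-separated and must have the same number of entries):

JSONObject results = SearchFiles.search(
        "myIndex",            // index name, resolved under indexDirectory
        "title;content",      // two fields to search ...
        "standard;standard",  // ... and one analyzer per field
        "BooleanQuery",       // queryType (illustrative value)
        "madrid",             // query text
        "en",                 // query language
        10);                  // number of hits to return
System.out.println(results.toString());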