Example usage for org.apache.lucene.index IndexReader document

List of usage examples for org.apache.lucene.index IndexReader document

Introduction

In this page you can find the example usage for org.apache.lucene.index IndexReader document.

Prototype




public final Document document(int docID) throws IOException 

Source Link

Document

Returns the stored fields of the document with the given docID in this index.

Usage

From source file:com.mathworks.xzheng.advsearching.CategorizerTest.java

License:Apache License

/**
 * Builds a term-frequency vector per category by scanning every live document
 * in the book index, grouping "subject" term vectors under each doc's
 * "category" field into {@code categoryMap}.
 *
 * @throws IOException if the index cannot be read
 */
private void buildCategoryVectors() throws IOException {
    IndexReader reader = IndexReader.open(TestUtil.getBookIndexDirectory());
    try {
        int maxDoc = reader.maxDoc();

        for (int i = 0; i < maxDoc; i++) {
            // BUG FIX: the original tested !reader.document(i), which cannot
            // compile (document() returns a Document, not a boolean). The
            // intent -- matching the identical scan pattern elsewhere in this
            // page -- is to skip deleted documents via isDeleted().
            if (!reader.isDeleted(i)) {
                Document doc = reader.document(i);
                String category = doc.get("category");

                Map vectorMap = (Map) categoryMap.get(category);
                if (vectorMap == null) {
                    vectorMap = new TreeMap();
                    categoryMap.put(category, vectorMap);
                }

                Terms terms = reader.getTermVector(i, "subject");

                addTermFreqToMap(vectorMap, terms);
            }
        }
    } finally {
        // The original leaked the reader; always release it.
        reader.close();
    }
}

From source file:com.mathworks.xzheng.advsearching.FunctionQueryTest.java

License:Apache License

/**
 * Searches the book index for "java in action", boosting recently published
 * books, and prints the top five hits with title, pubmonth and score.
 */
public void testRecency() throws Throwable {
    Directory directory = TestUtil.getBookIndexDirectory();
    IndexReader reader = IndexReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    // Track scores even though results are sorted by explicit sort fields.
    searcher.setDefaultFieldSortScoring(true, true);

    QueryParser parser = new QueryParser(Version.LUCENE_46, "contents",
            new StandardAnalyzer(Version.LUCENE_46));
    Query baseQuery = parser.parse("java in action"); // #A
    Query boostedQuery = new RecencyBoostingQuery(baseQuery, // #B
            2.0, 2 * 365, "pubmonthAsDay");

    // Primary sort by relevance score, ties broken by title.
    Sort sort = new Sort(new SortField[] { SortField.FIELD_SCORE,
            new SortField("title2", SortField.STRING) });
    TopDocs hits = searcher.search(boostedQuery, null, 5, sort);

    for (int rank = 0; rank < hits.scoreDocs.length; rank++) {
        Document doc = reader.document(hits.scoreDocs[rank].doc);
        System.out.println((1 + rank) + ": " + doc.get("title")
                + ": pubmonth=" + doc.get("pubmonth")
                + " score=" + hits.scoreDocs[rank].score);
    }

    searcher.close();
    reader.close();
    directory.close();
}

From source file:com.mathworks.xzheng.tools.BooksMoreLikeThis.java

License:Apache License

/**
 * For every document in the index named by the "index.dir" system property,
 * builds a MoreLikeThis query from its title/author fields and prints up to
 * ten similar documents (excluding the document itself).
 */
public static void main(String[] args) throws Throwable {

    String indexDir = System.getProperty("index.dir");
    FSDirectory directory = FSDirectory.open(new File(indexDir));
    IndexReader reader = IndexReader.open(directory);

    IndexSearcher searcher = new IndexSearcher(reader);

    MoreLikeThis mlt = new MoreLikeThis(reader); // #A
    mlt.setFieldNames(new String[] { "title", "author" });
    mlt.setMinTermFreq(1); // #B
    mlt.setMinDocFreq(1);

    int numDocs = reader.maxDoc();
    for (int docID = 0; docID < numDocs; docID++) { // #C
        System.out.println();
        Document sourceDoc = reader.document(docID);
        System.out.println(sourceDoc.get("title"));

        Query likeQuery = mlt.like(docID); // #D
        System.out.println("  query=" + likeQuery);

        TopDocs similarDocs = searcher.search(likeQuery, 10);
        if (similarDocs.totalHits == 0) {
            System.out.println("  None like this");
        }
        for (int hit = 0; hit < similarDocs.scoreDocs.length; hit++) {
            int hitDocID = similarDocs.scoreDocs[hit].doc;
            if (hitDocID != docID) { // #E -- skip the source document itself
                Document match = reader.document(hitDocID);
                System.out.println("  -> " + match.getField("title").stringValue());
            }
        }
    }

    reader.close();
    directory.close();
}

From source file:com.redhat.satellite.search.index.IndexManager.java

License:Open Source License

/**
 * Removes any documents which are not related to the passed in Set of good values.
 *
 * @param ids Set of ids of all known/good values
 * @param indexName index name to operate on
 * @param uniqField the name of the field in the Document used to uniquely identify
 * this record
 * @return the number of documents deleted
 */
public int deleteRecordsNotInList(Set<String> ids, String indexName, String uniqField) {
    int count = 0;
    IndexReader reader = null;
    try {
        reader = getIndexReader(indexName, IndexHandler.DEFAULT_LANG);

        // Use maxDoc() to iterate over all docs; numDocs() returns the
        // number of currently alive docs, leaving out the deleted ones.
        int maxDoc = reader.maxDoc();
        for (int i = 0; i < maxDoc; i++) {
            if (reader.isDeleted(i)) {
                continue; // already removed from the index; nothing to do
            }
            Document doc = reader.document(i);
            String uniqId = doc.getField(uniqField).stringValue();
            if (!ids.contains(uniqId)) {
                log.info(indexName + ":" + uniqField + ":  <" + uniqId
                        + "> not found in list of current/good values "
                        + "assuming this has been deleted from Database and we " + "should remove it.");
                removeFromIndex(indexName, uniqField, uniqId);
                count++;
            }
        }
    } catch (IOException e) {
        // Route the stack trace through the logger instead of printStackTrace().
        log.info("deleteRecordsNotInList() caught exception : " + e, e);
    } catch (IndexingException e) {
        log.info("deleteRecordsNotInList() caught exception : " + e, e);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                // best-effort close; nothing useful to recover here
            }
        }
    }
    return count;
}

From source file:com.revorg.goat.Document.java

License:Open Source License

/**
 * Returns the list of fields for a particular Document.
 * @param indexPath Directory that contains the Lucene Collection
 * @throws Exception/*from ww w .jav a  2 s. c  o  m*/
 * @return ActionResult
 */
public static List getDocumentFields(String indexPath) {
    //Assign Document to Lucene Document
    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

    try {

        IndexReader reader = IndexReader.open(indexPath);
        doc = reader.document(0);
        reader.close();
        List AllTheFields = doc.getFields();
        return AllTheFields;

    } catch (Exception e) {
        ActionResultError = " caught a " + e.getClass() + " with message: " + e.getMessage();
        System.out.println("Failure on getDocumentFields ");
    }
    ActionResult = "Failure";
    return new LinkedList();
}

From source file:com.searchbox.SuggeterDataStructureBuilder.java

License:Apache License

/**
 * Scans live documents in the searcher's index, concatenates the values of
 * the requested stored fields per document, and feeds the lower-cased text
 * into processText(), accumulating n-gram statistics in the instance
 * counters (numdocs, counts).
 *
 * @param searcher   searcher whose underlying IndexReader is scanned
 * @param fields     stored field names whose values are concatenated per doc
 * @param maxNumDocs cap on documents to analyze; -1 means analyze all
 */
private void iterateThroughDocuments(SolrIndexSearcher searcher, String[] fields, int maxNumDocs) {
    IndexReader reader = searcher.getIndexReader();
    // WARNING: returns null if there are no deletions
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    // Clamping first is safe: min(-1, maxDoc) stays -1, so the -1 sentinel
    // below still expands to "all documents".
    maxNumDocs = Math.min(maxNumDocs, reader.maxDoc());

    if (maxNumDocs == -1) {
        maxNumDocs = reader.maxDoc(); // -1 sentinel: no cap requested
    }
    LOGGER.info("Analyzing docs:\t" + numdocs);

    for (int docID = 0; docID < reader.maxDoc(); docID++) {
        if (numdocs > maxNumDocs) {
            break; // reached the requested number of analyzed (non-empty) docs
        }
        if (liveDocs != null && !liveDocs.get(docID)) {
            continue; // deleted
        }

        if ((docID % 1000) == 0) {
            LOGGER.debug("Doing " + docID + " of " + maxNumDocs);
        }

        StringBuilder text = new StringBuilder();
        for (String field : fields) {
            /*
             * not sure if this is the best way, might make sense to do a
             * process text for each field individually, but then book
             * keeping the doc freq for terms becomes a bit of a pain in the
             * ass
             */
            try {
                IndexableField[] multifield = reader.document(docID).getFields(field);
                for (IndexableField singlefield : multifield) {
                    // create one big string from all of the text in the
                    // documents for processing later on
                    text.append(". " + singlefield.stringValue());
                }

            } catch (IOException ex) {
                LOGGER.warn("Document " + docID + " missing requested field (" + field + ")...ignoring");
            }
        }
        // might as well see if its empty
        if (text.length() > 0) {
            // actually processes the massive string which was created from
            // all of the above fields
            processText(text.toString().toLowerCase());
            numdocs++;
        }
    }

    LOGGER.info("Number of documents analyzed: \t" + numdocs);
    for (int zz = 0; zz < counts.length; zz++) {
        LOGGER.info("Number of " + zz + "-grams: \t" + counts[zz]);
    }
}

From source file:com.searchbox.Tagger.java

License:Apache License

/**
 * Scans live documents in the searcher's index, concatenates the values of
 * the requested fields per document, and feeds the text into
 * processDocText(), building document/term-frequency counts in the instance
 * maps (dfcounts, tfcounts) keyed under DOC_COUNTS_STRING for totals.
 *
 * @param searcher   searcher whose underlying IndexReader is scanned
 * @param fields     field names whose values are concatenated per doc
 * @param maxNumDocs cap on documents to analyze; -1 means analyze all
 */
private void DfCountBuilder(SolrIndexSearcher searcher, String[] fields, int maxNumDocs) {
    IndexReader reader = searcher.getIndexReader();
    Bits liveDocs = MultiFields.getLiveDocs(reader); // WARNING: returns null if
                                                     // there are no deletions

    // Clamping first is safe: min(-1, maxDoc) stays -1, so the -1 sentinel
    // below still expands to "all documents".
    maxNumDocs = Math.min(maxNumDocs, reader.maxDoc());

    if (maxNumDocs == -1) {
        maxNumDocs = reader.maxDoc(); // -1 sentinel: no cap requested
    }
    LOGGER.info("Analyzing docs:\t" + numdocs);

    for (int docID = 0; docID < reader.maxDoc(); docID++) {
        if (numdocs > maxNumDocs) {
            break; // reached the requested number of analyzed (non-empty) docs
        }
        if (liveDocs != null && !liveDocs.get(docID)) {
            continue; // deleted
        }

        if ((docID % 1000) == 0) {
            LOGGER.debug("Doing " + docID + " of " + maxNumDocs);
        }

        StringBuilder text = new StringBuilder();
        for (String field : fields) { // not sure if this is the best way, might
                                      // make sense to do a
                                      // process text for each field individually, but then book keeping
                                      // the doc freq for terms becomes a bit of a pain in the ass
            try {
                text.append(". " + reader.document(docID).get(field));
            } catch (IOException ex) {
                LOGGER.warn("Document " + docID + " missing requested field (" + field + ")...ignoring");
            }
        }
        if (text.length() > 0) { // might as well see if its empty
            processDocText(text.toString());
            numdocs++;

        }
    }

    LOGGER.info("Number of documents analyzed: \t" + numdocs);
    dfcounts.put(DOC_COUNTS_STRING, numdocs);
    tfcounts.put(DOC_COUNTS_STRING, numdocs);
}

From source file:com.searchcode.app.service.TimeCodeSearcher.java

/**
 * Only used as fallback if getByRepoFileName fails for some reason due to what appears to be a lucene index bug
 * Using this is problematic because if the index is updated while this method is called it will possibly
 * return the incorrect result. We could add a shared lock between them both but that's hardly ideal especially
 * since when its called the index could already be updated
 *
 * @param documentId lucene document id to fetch
 * @return the populated CodeResult, or null if the lookup failed
 */
public CodeResult getById(int documentId) {
    CodeResult codeResult = null;

    // BUG FIX: the original closed the reader inside the try block, leaking
    // it whenever document()/readAllLines threw. try-with-resources (the file
    // already uses Java 7+ APIs) guarantees the close on every path.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(this.INDEXPATH)))) {
        Document doc = reader.document(documentId);

        String filepath = doc.get(Values.PATH);

        List<String> code = new ArrayList<>();
        try {
            code = Files.readAllLines(Paths.get(filepath), StandardCharsets.UTF_8);
        } catch (Exception ex) {
            LOGGER.warning("Indexed file appears to binary: " + filepath);
        }

        codeResult = new CodeResult(code, null);
        codeResult.setCodePath(doc.get(Values.FILELOCATIONFILENAME));
        codeResult.setFileName(doc.get(Values.FILENAME));
        codeResult.setLanguageName(doc.get(Values.LANGUAGENAME));
        codeResult.setMd5hash(doc.get(Values.MD5HASH));
        codeResult.setCodeLines(doc.get(Values.CODELINES));
        codeResult.setDocumentId(documentId);
        codeResult.setRepoName(doc.get(Values.REPONAME));
        codeResult.setRepoLocation(doc.get(Values.REPOLOCATION));
        codeResult.setCodeOwner(doc.get(Values.CODEOWNER));
    } catch (Exception ex) {
        LOGGER.warning(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage());
    }

    return codeResult;
}

From source file:com.sun.socialsite.business.impl.LuceneSearchManagerImpl.java

License:Open Source License

/**
 * Adds or refreshes the index entry for the given app.
 *
 * @param app the app whose metadata should be indexed
 * @return false if the index entry was not updated because it
 * was already current; true otherwise.
 * @throws IOException if the index cannot be read or written
 */
public boolean addToIndex(final App app) throws IOException {

    boolean needNewEntry = true;

    String key = getKey(app);
    String url = app.getURL().toExternalForm();
    String title = app.getTitle();
    String description = app.getDescription();

    // Scan every existing document with this key; if one already matches all
    // of the fields we would store, the entry is current and we skip writing.
    IndexReader reader = IndexReader.open(indexDir);
    TermDocs termDocs = reader.termDocs(new Term("key", key));
    while (termDocs.next()) {
        Document existingDoc = reader.document(termDocs.doc());
        if (areEqual("app", existingDoc.get("class")) && areEqual(url, existingDoc.get("url"))
                && areEqual(title, existingDoc.get("title"))
                && areEqual(description, existingDoc.get("description"))) {
            needNewEntry = false;
        }
    }
    termDocs.close();
    reader.close();

    if (needNewEntry) {
        // key/class are stored untokenized (exact-match lookups); the
        // human-readable fields are tokenized for full-text search.
        Document newDoc = new Document();
        newDoc.add(new Field("key", key, Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("class", "app", Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("url", url, Field.Store.YES, Field.Index.TOKENIZED));
        if (title != null)
            newDoc.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
        if (description != null)
            newDoc.add(new Field("description", description, Field.Store.YES, Field.Index.TOKENIZED));

        IndexWriter writer = null;
        try {
            writer = new IndexWriter(indexDir, analyzer, false);
            writer.deleteDocuments(new Term("key", key)); // Delete old entry, if present
            writer.addDocument(newDoc);
        } finally {
            // Best-effort close; an exception here would mask the original one.
            if (writer != null)
                try {
                    writer.close();
                } catch (Exception e) {
                }
            ;
        }

        log.trace(String.format("Indexed app[url=%s,title=%s,description=%s]", url, title, description));
    }

    return needNewEntry;
}

From source file:com.sun.socialsite.business.impl.LuceneSearchManagerImpl.java

License:Open Source License

/**
 * Adds or refreshes the index entry for the given group.
 *
 * @param group the group whose metadata should be indexed
 * @return false if the index entry was not updated because it
 * was already current; true otherwise.
 * @throws IOException if the index cannot be read or written
 */
public boolean addToIndex(final Group group) throws IOException {

    boolean needNewEntry = true;

    String key = getKey(group);
    String handle = group.getHandle();
    String name = group.getName();
    String description = group.getDescription();

    // Scan every existing document with this key; if one already matches all
    // of the fields we would store, the entry is current and we skip writing.
    IndexReader reader = IndexReader.open(indexDir);
    TermDocs termDocs = reader.termDocs(new Term("key", key));
    while (termDocs.next()) {
        Document existingDoc = reader.document(termDocs.doc());
        if (areEqual("group", existingDoc.get("class")) && areEqual(handle, existingDoc.get("handle"))
                && areEqual(name, existingDoc.get("name"))
                && areEqual(description, existingDoc.get("description"))) {
            needNewEntry = false;
        }
    }
    termDocs.close();
    reader.close();

    if (needNewEntry) {
        Document newDoc = new Document();
        newDoc.add(new Field("key", key, Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("class", "group", Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("handle", handle, Field.Store.YES, Field.Index.TOKENIZED));
        newDoc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
        if (description != null)
            newDoc.add(new Field("description", description, Field.Store.YES, Field.Index.TOKENIZED));

        IndexWriter writer = null;
        try {
            writer = new IndexWriter(indexDir, analyzer, false);
            writer.deleteDocuments(new Term("key", key)); // Delete old entry, if present
            writer.addDocument(newDoc);
        } finally {
            // Best-effort close; an exception here would mask the original one.
            if (writer != null)
                try {
                    writer.close();
                } catch (Exception e) {
                }
        }

        // BUG FIX: the format string is handle=%s,name=%s but the original
        // passed (name, handle, ...), swapping the two values in the log.
        log.trace(String.format("Indexed group[handle=%s,name=%s,description=%s]", handle, name, description));
    }

    return needNewEntry;
}