Usage examples for org.apache.lucene.index.IndexReader#document
public final Document document(int docID) throws IOException
Returns the stored fields of the nth Document in this index.
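All of the examples below follow the same basic pattern: open an IndexReader, loop over document IDs, and call document(int) to load each document's stored fields. As a minimal sketch of that pattern (assuming the Lucene 3.x API used throughout these examples; the index path and the stored field name "id" are hypothetical placeholders):

    import java.io.File;
    import java.io.IOException;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.store.FSDirectory;

    public class DumpStoredField {
        public static void main(String[] args) throws IOException {
            // open a reader on an existing index directory (placeholder path)
            IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")));
            try {
                for (int i = 0; i < reader.maxDoc(); i++) {
                    // skip slots of deleted documents (required with 3.x maxDoc() iteration)
                    if (reader.isDeleted(i)) {
                        continue;
                    }
                    // load the stored fields of the i-th document
                    Document doc = reader.document(i);
                    System.out.println(doc.get("id")); // "id" is a hypothetical stored field
                }
            } finally {
                reader.close();
            }
        }
    }

Note that document(int) only returns stored fields, and that deleted document slots must be skipped explicitly when iterating up to maxDoc(), as several of the examples below do with isDeleted(int).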
From source file:de.ingrid.interfaces.csw.index.impl.IngridGeoTKLuceneIndexer.java
License:EUPL
/**
 * This method removes documents identified by a query from the index.
 *
 * @param queryString the query string identifying the documents to remove
 * @throws ParseException
 */
public List<String> removeDocumentByQuery(final String queryString) throws ParseException {
    List<String> deletedRecords = new ArrayList<String>();
    try {
        final QueryParser parser = new QueryParser(Version.LUCENE_36, "anytext", analyzer);
        Query query = parser.parse(queryString);
        final IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        final IndexWriter writer = new IndexWriter(LuceneUtils.getAppropriateDirectory(getFileDirectory()), config);
        LOGGER.log(logLevel, "Query:{0}", query);
        IndexReader reader = IndexReader.open(writer, false);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs docs = searcher.search(query, Integer.MAX_VALUE);
        for (ScoreDoc doc : docs.scoreDocs) {
            deletedRecords.add(reader.document(doc.doc).get("id"));
        }
        writer.deleteDocuments(query);
        writer.commit();
        searcher.close();
        reader.close();
        writer.close();
    } catch (CorruptIndexException ex) {
        LOGGER.log(Level.WARNING, "CorruptIndexException while indexing document: " + ex.getMessage(), ex);
    } catch (IOException ex) {
        LOGGER.log(Level.WARNING, "IOException while indexing document: " + ex.getMessage(), ex);
    }
    return deletedRecords;
}
From source file:de.linguatools.disco.Compositionality.java
License:Apache License
/**
 * Find the most similar words in the DISCO word space for an input word
 * vector. While the word vector can represent a multi-token word (if it was
 * produced by one of the methods
 * <code>Compositionality.composeWordVectors()</code>) the most
 * similar words will only be single-token words from the index.<br/>
 * <b>Warning</b>: This method is very time consuming and should only be
 * used with word spaces that have been loaded into memory!
 * @param wordvector input word vector
 * @param disco DISCO word space
 * @param simMeasure
 * @return List of all words (with their similarity values) whose similarity
 * with the <code>wordvector</code> is greater than zero, ordered by
 * similarity value (highest value first).
 * @throws java.io.IOException
 */
public ArrayList<ReturnDataCol> similarWords(HashMap<String, Float> wordvector, DISCO disco,
        SimilarityMeasures simMeasure) throws IOException {
    // get an IndexReader for the index directory
    IndexReader ir = disco.getIndexReader();
    // iterate over all documents
    ArrayList<ReturnDataCol> result = new ArrayList<ReturnDataCol>();
    for (int i = 0; i < ir.numDocs(); i++) {
        Document doc = null;
        try {
            doc = ir.document(i);
        } catch (CorruptIndexException ex) {
            continue;
        } catch (IOException ex) {
            continue;
        }
        // fetch the word vector for word no. i
        String word = doc.get("word");
        HashMap<String, Float> wv = getWordvector(word, disco);
        // compute the similarity between the word vectors
        float sim = semanticSimilarity(wordvector, wv, simMeasure);
        if (sim > 0.0F) {
            ReturnDataCol r = new ReturnDataCol(word, sim);
            result.add(r);
        }
    }
    // sort by highest similarity value
    Collections.sort(result, new ValueComparator());
    return result;
}
From source file:de.linguatools.disco.DISCO.java
License:Apache License
/***************************************************************************
 * Run through all documents (i.e. queryable words) in the index, and retrieve
 * each word and its frequency. Write both pieces of information to the file named
 * outputFileName. This method can be used to check index integrity.<br/>
 * @param outputFileName
 * @return number of words written to the output file. In case of success the
 * value is equal to the number of words in the index.
 */
public int wordFrequencyList(String outputFileName) {
    // create an IndexReader for the index directory
    IndexReader ir = null;
    try {
        if (indexRAM != null) {
            ir = IndexReader.open(indexRAM);
        } else {
            ir = IndexReader.open(FSDirectory.open(new File(indexName)));
        }
    } catch (CorruptIndexException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }
    // get the number of documents in the index
    int N = ir.numDocs();
    // open the output file
    FileWriter fw;
    try {
        fw = new FileWriter(outputFileName);
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }
    // iterate over all documents
    int corrupt = 0;
    int ioerror = 0;
    int i = 0;
    for (i = 0; i < N; i++) {
        Document doc = null;
        try {
            doc = ir.document(i);
        } catch (CorruptIndexException ex) {
            corrupt++;
            continue;
        } catch (IOException ex) {
            ioerror++;
            continue;
        }
        // fetch word no. i
        String word = doc.get("word");
        // fetch the frequency of word i
        int f = Integer.parseInt(doc.get("freq"));
        try {
            // write word and frequency to the output
            fw.write(word + "\t" + f + "\n");
        } catch (IOException ex) {
            System.out.println(DISCO.class.getName() + ": word " + i + ": " + ex);
            return i;
        }
        // print progress info
        if (i % 100 == 0) {
            System.out.print("\r" + i);
        }
    }
    System.out.println();
    if (corrupt > 0 || ioerror > 0) {
        int e = corrupt + ioerror;
        System.out.println("*** WARNING! ***");
        System.out.println("The language data packet \"" + indexName + "\" " + "has " + e
                + " defect entries (" + corrupt + " corrupt, " + ioerror + " IO errors)");
        System.out.println("All functioning words have been written to " + outputFileName);
    }
    // clean up
    try {
        fw.close();
        ir.close();
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }
    return (i - corrupt - ioerror);
}
From source file:de.mirkosertic.desktopsearch.LuceneIndexHandler.java
License:Open Source License
public void cleanupDeadContent() throws IOException {
    searcherManager.maybeRefreshBlocking();
    IndexSearcher theSearcher = searcherManager.acquire();
    try {
        IndexReader theReader = theSearcher.getIndexReader();
        for (int i = 0; i < theReader.maxDoc(); i++) {
            Document theDocument = theReader.document(i);
            File theFile = new File(theDocument.getField(IndexFields.FILENAME).stringValue());
            if (!theFile.exists()) {
                LOGGER.info("Removing file " + theFile + " from index as it does not exist anymore.");
                String theUniqueID = theDocument.getField(IndexFields.UNIQUEID).stringValue();
                indexWriter.deleteDocuments(new Term(IndexFields.UNIQUEID, theUniqueID));
            }
        }
    } finally {
        searcherManager.release(theSearcher);
    }
}
From source file:de.schlund.pfixcore.lucefix.PfixReadjustment.java
License:Open Source License
/**
 * Checks the list of include parts for changes and updates the search index.
 */
public void readjust() {
    Collection<Tripel> partsKnownByPustefix = getUsedTripels();
    IndexReader reader = null;
    PfixQueueManager queue;
    boolean jobDone;
    long startLoop, stopLoop, startCollect, stopCollect, startIndexLoop, stopIndexLoop, startAddLoop, stopAddLoop;
    long collectTime = 0;
    int knownDocsSize, newDocs, deleteDocs, numDocs;
    startLoop = stopLoop = startCollect = stopCollect = startIndexLoop = stopIndexLoop = startAddLoop = stopAddLoop = 0;
    newDocs = knownDocsSize = deleteDocs = numDocs = 0;
    startLoop = System.currentTimeMillis();
    Set<Tripel> tripelsToIndex = new TreeSet<Tripel>();
    queue = PfixQueueManager.getInstance(null);
    try {
        jobDone = false;
        startCollect = System.currentTimeMillis();
        partsKnownByPustefix = getUsedTripels();
        stopCollect = System.currentTimeMillis();
        collectTime = stopCollect - startCollect;
        knownDocsSize = partsKnownByPustefix.size();
        try {
            reader = IndexReader.open(LUCENE_DATA);
        } catch (IOException ioe) {
            LOG.warn("broken or nonexistent database -> will queue ALL known parts");
            for (Iterator<Tripel> iter = partsKnownByPustefix.iterator(); iter.hasNext();) {
                Tripel element = iter.next();
                element.setType(Tripel.Type.INSERT);
                newDocs++;
                if (!tripelsToIndex.add(element)) {
                    LOG.debug("duplicated insert");
                }
            }
            jobDone = true;
        }
        if (!jobDone) {
            numDocs = reader.numDocs();
            startIndexLoop = System.currentTimeMillis();
            docloop: for (int i = 0; i < numDocs; i++) {
                Document currentdoc;
                try {
                    currentdoc = reader.document(i);
                } catch (RuntimeException e) {
                    // this happens if we want to access a deleted document -> continue
                    continue docloop;
                }
                // check if needed
                String path = currentdoc.get(PreDoc.PATH);
                Tripel pfixTripel = new Tripel(path, null);
                if (partsKnownByPustefix.contains(pfixTripel)) {
                    // check timestamps
                    File f = new File(GlobalConfig.getDocroot(), currentdoc.get(PreDoc.FILENAME));
                    if (f.lastModified() != DateField.stringToTime(currentdoc.get(PreDoc.LASTTOUCH))) {
                        // timestamp differs
                        pfixTripel.setType(Tripel.Type.INSERT);
                        LOG.debug("TS differs: " + pfixTripel);
                        newDocs++;
                        if (!tripelsToIndex.add(pfixTripel)) {
                            LOG.debug("duplicated insert " + pfixTripel);
                        }
                    }
                    partsKnownByPustefix.remove(pfixTripel);
                } else {
                    // part not needed anymore
                    Tripel newTripel = new Tripel(currentdoc.get(PreDoc.PATH), Tripel.Type.DELETE);
                    deleteDocs++;
                    queue.queue(newTripel);
                }
            }
            stopIndexLoop = System.currentTimeMillis();
            // now partsKnownByPustefix only contains parts which are NOT indexed...
            startAddLoop = System.currentTimeMillis();
            for (Iterator<Tripel> iter = partsKnownByPustefix.iterator(); iter.hasNext();) {
                Tripel element = iter.next();
                element.setType(Tripel.Type.INSERT);
                newDocs++;
                if (!tripelsToIndex.add(element)) {
                    LOG.debug("duplicated insert " + element);
                }
            }
            stopAddLoop = System.currentTimeMillis();
        }
    } catch (IOException ioe) {
        LOG.error("error reading index", ioe);
    }
    // it's a TreeSet, so it is already sorted
    for (Tripel tripel : tripelsToIndex) {
        queue.queue(tripel);
    }
    stopLoop = System.currentTimeMillis();
    long needed = stopLoop - startLoop;
    if (newDocs != 0 || deleteDocs != 0) {
        LOG.debug(needed + "ms (getUsedTripels(): " + collectTime + "ms (" + knownDocsSize + "u) indexloop: "
                + (stopIndexLoop - startIndexLoop) + "|" + (stopAddLoop - startAddLoop) + "ms (" + numDocs
                + "u), added " + newDocs + "+" + deleteDocs + " queueitems");
    }
    try {
        if (reader != null) {
            reader.close();
            reader = null;
        }
    } catch (IOException e) {
        LOG.error("error while closing reader", e);
    }
}
From source file:de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.LuceneIndexerTest.java
License:Apache License
@Test
public void testSearch() throws Exception {
    // Check that the fields and all documents exist
    IndexReader ir0 = IndexReader.open(FSDirectory.open(targetIndex0));
    IndexReader ir1 = IndexReader.open(FSDirectory.open(targetIndex1));
    Assert.assertEquals("Number of documents", 3, ir0.numDocs() + ir1.numDocs());
    Document doc = ir0.document(0);
    Assert.assertNotNull("Field: gram", doc.getField("gram"));
    Assert.assertNotNull("Field: freq", doc.getField("freq"));
    ir0.close();
    ir1.close();

    // Search on the index
    Finder f = new Finder(index, jWeb1T);
    Assert.assertEquals(f.find("relax").size(), 3);
    Assert.assertEquals(f.find("couch").size(), 1);
    Assert.assertEquals(f.find("relax couch").size(), 1);
    Assert.assertEquals(f.find("couchdb").size(), 1);
}
From source file:de.tudarmstadt.ukp.teaching.uima.nounDecompounding.ranking.TotalFreqAmout.java
License:Open Source License
/**
 * Adds up all frequency values for a given directory.
 * @return the total frequency count
 * @throws IOException
 */
protected BigInteger countFreq(FSDirectory dir) throws IOException {
    BigInteger count = BigInteger.valueOf(0);
    IndexReader reader = IndexReader.open(dir);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            continue;
        }
        Document doc = reader.document(i);
        count = count.add(new BigInteger(doc.get("freq")));
    }
    return count;
}
From source file:de.tudarmstadt.ukp.teaching.uima.nounDecompounding.web1t.LuceneIndexerTest.java
License:Open Source License
@Test
public void testSearch() throws Exception {
    // Check that the fields and all documents exist
    IndexReader ir = IndexReader.open(FSDirectory.open(targetIndex));
    Assert.assertEquals("Number of documents", 2, ir.numDocs());
    Document doc = ir.document(0);
    Assert.assertNotNull("Field: gram", doc.getField("gram"));
    Assert.assertNotNull("Field: freq", doc.getField("freq"));
    ir.close();

    // Search on the index
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(targetIndex));
    QueryParser p = new QueryParser(Version.LUCENE_30, "token", new StandardAnalyzer(Version.LUCENE_30));
    Query q = p.parse("gram:relax");
    Assert.assertEquals("Hit count 'relax'", 2, searcher.search(q, 100).totalHits);
    q = p.parse("gram:couch");
    Assert.assertEquals("Hit count 'couch'", 1, searcher.search(q, 100).totalHits);
    q = p.parse("gram:relax AND gram:couch");
    Assert.assertEquals("Hit count 'relax AND couch'", 1, searcher.search(q, 100).totalHits);
    q = p.parse("gram:couchdb");
    Assert.assertEquals("Hit count 'couchdb'", 1, searcher.search(q, 100).totalHits);
    searcher.close();
}
From source file:de.tudarmstadt.ukp.teaching.uima.nounDecompounding.web1t.LuceneIndexerTest.java
License:Open Source License
@Test
public void testData() throws Exception {
    IndexReader ir = IndexReader.open(FSDirectory.open(targetIndex));
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(targetIndex));
    QueryParser p = new QueryParser(Version.LUCENE_30, "gram", new StandardAnalyzer(Version.LUCENE_30));

    // Test if all data is set correctly
    Query q = p.parse("gram:couch");
    Document doc = ir.document(searcher.search(q, 100).scoreDocs[0].doc);
    Assert.assertEquals(new Integer(100), Integer.valueOf(doc.get("freq")));
    Assert.assertEquals("relax on the couch", doc.get("gram"));

    ir.close();
    searcher.close();
}
From source file:de.uni_koeln.spinfo.maalr.lucene.core.Dictionary.java
License:Apache License
public IndexStatistics getIndexStatistics() {
    final IndexStatistics statistics = new IndexStatistics();
    try {
        queue.push(new IndexOperation() {

            @Override
            public void execute() throws Exception {
                int all = indexProvider.getSearcher().getIndexReader().numDocs();
                int unverified = 0;
                int approved = 0;
                int unknown = 0;
                IndexReader reader = indexProvider.getSearcher().getIndexReader();
                HashMap<String, Integer> byCategory = new HashMap<String, Integer>();
                for (int i = 0; i < all; i++) {
                    Document document = reader.document(i);
                    String verification = document.get(LemmaVersion.VERIFICATION);
                    try {
                        if (Verification.ACCEPTED.equals(Verification.valueOf(verification))) {
                            approved++;
                        } else if (Verification.UNVERIFIED.equals(Verification.valueOf(verification))) {
                            unverified++;
                        } else {
                            unknown++;
                        }
                    } catch (Exception e) {
                        unknown++;
                    }
                    String overlayA = document.get(LemmaVersion.OVERLAY_LANG1);
                    if (overlayA != null) {
                        Integer old = byCategory.get(overlayA);
                        if (old == null)
                            old = 0;
                        byCategory.put(overlayA, old + 1);
                    }
                    String overlayB = document.get(LemmaVersion.OVERLAY_LANG2);
                    if (overlayB != null) {
                        Integer old = byCategory.get(overlayB);
                        if (old == null)
                            old = 0;
                        byCategory.put(overlayB, old + 1);
                    }
                }
                statistics.setOverlayCount(byCategory);
                statistics.setNumberOfEntries(all);
                statistics.setUnverifiedEntries(unverified);
                statistics.setApprovedEntries(approved);
                statistics.setUnknown(unknown);
                statistics.setLastUpdated(indexCreator.getLastUpdated());
            }
        });
        return statistics;
    } catch (Exception e) {
        return new IndexStatistics();
    }
}