Usage examples for org.apache.lucene.index.IndexReader#document
public final Document document(int docID) throws IOException
Returns the stored fields of the nth Document in this index.
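All of the examples below follow the same basic pattern: open an IndexReader, loop over document IDs, and call document(int) to load each document's stored fields. As a minimal sketch of that pattern (assuming the Lucene 3.x API used throughout these examples; the index path and the stored field name "id" are hypothetical placeholders):

    import java.io.File;
    import java.io.IOException;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.store.FSDirectory;

    public class DumpStoredField {
        public static void main(String[] args) throws IOException {
            // open a reader on an existing index directory (placeholder path)
            IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")));
            try {
                for (int i = 0; i < reader.maxDoc(); i++) {
                    // skip slots of deleted documents (required with 3.x maxDoc() iteration)
                    if (reader.isDeleted(i)) {
                        continue;
                    }
                    // load the stored fields of the i-th document
                    Document doc = reader.document(i);
                    System.out.println(doc.get("id")); // "id" is a hypothetical stored field
                }
            } finally {
                reader.close();
            }
        }
    }

Note that document(int) only returns stored fields, and that deleted document slots must be skipped explicitly when iterating up to maxDoc(), as several of the examples below do with isDeleted(int).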
From source file:de.ingrid.interfaces.csw.index.impl.IngridGeoTKLuceneIndexer.java
License:EUPL
/**
 * This method removes documents identified by a query from the index.
 *
 * @param queryString the query string identifying the documents to remove
 * @throws ParseException
 */
public List<String> removeDocumentByQuery(final String queryString) throws ParseException {
    List<String> deletedRecords = new ArrayList<String>();
    try {
        final QueryParser parser = new QueryParser(Version.LUCENE_36, "anytext", analyzer);
        Query query = parser.parse(queryString);
        final IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        final IndexWriter writer = new IndexWriter(LuceneUtils.getAppropriateDirectory(getFileDirectory()), config);
        LOGGER.log(logLevel, "Query:{0}", query);
        IndexReader reader = IndexReader.open(writer, false);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs docs = searcher.search(query, Integer.MAX_VALUE);
        for (ScoreDoc doc : docs.scoreDocs) {
            deletedRecords.add(reader.document(doc.doc).get("id"));
        }
        writer.deleteDocuments(query);
        writer.commit();
        searcher.close();
        reader.close();
        writer.close();
    } catch (CorruptIndexException ex) {
        LOGGER.log(Level.WARNING, "CorruptIndexException while indexing document: " + ex.getMessage(), ex);
    } catch (IOException ex) {
        LOGGER.log(Level.WARNING, "IOException while indexing document: " + ex.getMessage(), ex);
    }
    return deletedRecords;
}
From source file:de.linguatools.disco.Compositionality.java
License:Apache License
/**
 * Find the most similar words in the DISCO word space for an input word
 * vector. While the word vector can represent a multi-token word (if it was
 * produced by one of the methods
 * <code>Compositionality.composeWordVectors()</code>) the most
 * similar words will only be single-token words from the index.<br/>
 * <b>Warning</b>: This method is very time consuming and should only be
 * used with word spaces that have been loaded into memory!
 * @param wordvector input word vector
 * @param disco DISCO word space
 * @param simMeasure
 * @return List of all words (with their similarity values) whose similarity
 * with the <code>wordvector</code> is greater than zero, ordered by
 * similarity value (highest value first).
 * @throws java.io.IOException
 */
public ArrayList<ReturnDataCol> similarWords(HashMap<String, Float> wordvector, DISCO disco,
        SimilarityMeasures simMeasure) throws IOException {
    // get an IndexReader for the index directory
    IndexReader ir = disco.getIndexReader();
    // iterate over all documents
    ArrayList<ReturnDataCol> result = new ArrayList<ReturnDataCol>();
    for (int i = 0; i < ir.numDocs(); i++) {
        Document doc = null;
        try {
            doc = ir.document(i);
        } catch (CorruptIndexException ex) {
            continue;
        } catch (IOException ex) {
            continue;
        }
        // fetch the word vector for word no. i
        String word = doc.get("word");
        HashMap<String, Float> wv = getWordvector(word, disco);
        // compute the similarity between the word vectors
        float sim = semanticSimilarity(wordvector, wv, simMeasure);
        if (sim > 0.0F) {
            ReturnDataCol r = new ReturnDataCol(word, sim);
            result.add(r);
        }
    }
    // sort by highest similarity value
    Collections.sort(result, new ValueComparator());
    return result;
}
From source file:de.linguatools.disco.DISCO.java
License:Apache License
/***************************************************************************
 * Run through all documents (i.e. queryable words) in the index, and retrieve
 * each word and its frequency. Write both pieces of information to the file named
 * outputFileName. This method can be used to check index integrity.<br/>
 * @param outputFileName
 * @return number of words written to the output file. In case of success the
 * value is equal to the number of words in the index.
 */
public int wordFrequencyList(String outputFileName) {
    // create an IndexReader for the index directory
    IndexReader ir = null;
    try {
        if (indexRAM != null) {
            ir = IndexReader.open(indexRAM);
        } else {
            ir = IndexReader.open(FSDirectory.open(new File(indexName)));
        }
    } catch (CorruptIndexException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }
    // get the number of documents in the index
    int N = ir.numDocs();
    // open the output file
    FileWriter fw;
    try {
        fw = new FileWriter(outputFileName);
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }
    // iterate over all documents
    int corrupt = 0;
    int ioerror = 0;
    int i = 0;
    for (i = 0; i < N; i++) {
        Document doc = null;
        try {
            doc = ir.document(i);
        } catch (CorruptIndexException ex) {
            corrupt++;
            continue;
        } catch (IOException ex) {
            ioerror++;
            continue;
        }
        // fetch word no. i
        String word = doc.get("word");
        // fetch the frequency of word i
        int f = Integer.parseInt(doc.get("freq"));
        try {
            // write word and frequency to the output
            fw.write(word + "\t" + f + "\n");
        } catch (IOException ex) {
            System.out.println(DISCO.class.getName() + ": word " + i + ": " + ex);
            return i;
        }
        // print progress info
        if (i % 100 == 0) {
            System.out.print("\r" + i);
        }
    }
    System.out.println();
    if (corrupt > 0 || ioerror > 0) {
        int e = corrupt + ioerror;
        System.out.println("*** WARNING! ***");
        System.out.println("The language data packet \"" + indexName + "\" " + "has " + e
                + " defect entries (" + corrupt + " corrupt, " + ioerror + " IO errors)");
        System.out.println("All functioning words have been written to " + outputFileName);
    }
    // clean up
    try {
        fw.close();
        ir.close();
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }
    return (i - corrupt - ioerror);
}
From source file:de.mirkosertic.desktopsearch.LuceneIndexHandler.java
License:Open Source License
public void cleanupDeadContent() throws IOException {
    searcherManager.maybeRefreshBlocking();
    IndexSearcher theSearcher = searcherManager.acquire();
    try {
        IndexReader theReader = theSearcher.getIndexReader();
        for (int i = 0; i < theReader.maxDoc(); i++) {
            Document theDocument = theReader.document(i);
            File theFile = new File(theDocument.getField(IndexFields.FILENAME).stringValue());
            if (!theFile.exists()) {
                LOGGER.info("Removing file " + theFile + " from index as it does not exist anymore.");
                String theUniqueID = theDocument.getField(IndexFields.UNIQUEID).stringValue();
                indexWriter.deleteDocuments(new Term(IndexFields.UNIQUEID, theUniqueID));
            }
        }
    } finally {
        searcherManager.release(theSearcher);
    }
}
From source file:de.schlund.pfixcore.lucefix.PfixReadjustment.java
License:Open Source License
/**
 * Checks the list of include parts for changes and updates the search index.
 */
public void readjust() {
    Collection<Tripel> partsKnownByPustefix = getUsedTripels();
    IndexReader reader = null;
    PfixQueueManager queue;
    boolean jobDone;
    long startLoop, stopLoop, startCollect, stopCollect, startIndexLoop, stopIndexLoop, startAddLoop, stopAddLoop;
    long collectTime = 0;
    int knownDocsSize, newDocs, deleteDocs, numDocs;
    startLoop = stopLoop = startCollect = stopCollect = startIndexLoop = stopIndexLoop = startAddLoop = stopAddLoop = 0;
    newDocs = knownDocsSize = deleteDocs = numDocs = 0;
    startLoop = System.currentTimeMillis();
    Set<Tripel> tripelsToIndex = new TreeSet<Tripel>();
    queue = PfixQueueManager.getInstance(null);
    try {
        jobDone = false;
        startCollect = System.currentTimeMillis();
        partsKnownByPustefix = getUsedTripels();
        stopCollect = System.currentTimeMillis();
        collectTime = stopCollect - startCollect;
        knownDocsSize = partsKnownByPustefix.size();
        try {
            reader = IndexReader.open(LUCENE_DATA);
        } catch (IOException ioe) {
            LOG.warn("broken or nonexistent database -> will queue ALL known parts");
            for (Iterator<Tripel> iter = partsKnownByPustefix.iterator(); iter.hasNext();) {
                Tripel element = iter.next();
                element.setType(Tripel.Type.INSERT);
                newDocs++;
                if (!tripelsToIndex.add(element)) {
                    LOG.debug("duplicated insert");
                }
            }
            jobDone = true;
        }
        if (!jobDone) {
            numDocs = reader.numDocs();
            startIndexLoop = System.currentTimeMillis();
            docloop: for (int i = 0; i < numDocs; i++) {
                Document currentdoc;
                try {
                    currentdoc = reader.document(i);
                } catch (RuntimeException e) {
                    // this happens if we want to access a deleted document -> continue
                    continue docloop;
                }
                // check if needed
                String path = currentdoc.get(PreDoc.PATH);
                Tripel pfixTripel = new Tripel(path, null);
                if (partsKnownByPustefix.contains(pfixTripel)) {
                    // check timestamps
                    File f = new File(GlobalConfig.getDocroot(), currentdoc.get(PreDoc.FILENAME));
                    if (f.lastModified() != DateField.stringToTime(currentdoc.get(PreDoc.LASTTOUCH))) {
                        // timestamp differs
                        pfixTripel.setType(Tripel.Type.INSERT);
                        LOG.debug("TS differs: " + pfixTripel);
                        newDocs++;
                        if (!tripelsToIndex.add(pfixTripel)) {
                            LOG.debug("duplicated insert " + pfixTripel);
                        }
                    }
                    partsKnownByPustefix.remove(pfixTripel);
                } else {
                    // part not needed anymore
                    Tripel newTripel = new Tripel(currentdoc.get(PreDoc.PATH), Tripel.Type.DELETE);
                    deleteDocs++;
                    queue.queue(newTripel);
                }
            }
            stopIndexLoop = System.currentTimeMillis();
            // now partsKnownByPustefix only contains parts which are NOT indexed...
            startAddLoop = System.currentTimeMillis();
            for (Iterator<Tripel> iter = partsKnownByPustefix.iterator(); iter.hasNext();) {
                Tripel element = iter.next();
                element.setType(Tripel.Type.INSERT);
                newDocs++;
                if (!tripelsToIndex.add(element)) {
                    LOG.debug("duplicated insert " + element);
                }
            }
            stopAddLoop = System.currentTimeMillis();
        }
    } catch (IOException ioe) {
        LOG.error("error reading index", ioe);
    }
    // it's a TreeSet, so it is already sorted
    for (Tripel tripel : tripelsToIndex) {
        queue.queue(tripel);
    }
    stopLoop = System.currentTimeMillis();
    long needed = stopLoop - startLoop;
    if (newDocs != 0 || deleteDocs != 0) {
        LOG.debug(needed + "ms (getUsedTripels(): " + collectTime + "ms (" + knownDocsSize + "u) indexloop: "
                + (stopIndexLoop - startIndexLoop) + "|" + (stopAddLoop - startAddLoop) + "ms (" + numDocs
                + "u), added " + newDocs + "+" + deleteDocs + " queueitems");
    }
    try {
        if (reader != null) {
            reader.close();
            reader = null;
        }
    } catch (IOException e) {
        LOG.error("error while closing reader", e);
    }
}
From source file:de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.LuceneIndexerTest.java
License:Apache License
@Test
public void testSearch() throws Exception {
    // Check that the fields and all documents exist
    IndexReader ir0 = IndexReader.open(FSDirectory.open(targetIndex0));
    IndexReader ir1 = IndexReader.open(FSDirectory.open(targetIndex1));
    Assert.assertEquals("Number of documents", 3, ir0.numDocs() + ir1.numDocs());
    Document doc = ir0.document(0);
    Assert.assertNotNull("Field: gram", doc.getField("gram"));
    Assert.assertNotNull("Field: freq", doc.getField("freq"));
    ir0.close();
    ir1.close();

    // Search on the index
    Finder f = new Finder(index, jWeb1T);
    Assert.assertEquals(f.find("relax").size(), 3);
    Assert.assertEquals(f.find("couch").size(), 1);
    Assert.assertEquals(f.find("relax couch").size(), 1);
    Assert.assertEquals(f.find("couchdb").size(), 1);
}
From source file:de.tudarmstadt.ukp.teaching.uima.nounDecompounding.ranking.TotalFreqAmout.java
License:Open Source License
/**
 * Adds up all frequency values for a given directory.
 * @return the total frequency count
 * @throws IOException
 */
protected BigInteger countFreq(FSDirectory dir) throws IOException {
    BigInteger count = BigInteger.valueOf(0);
    IndexReader reader = IndexReader.open(dir);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            continue;
        }
        Document doc = reader.document(i);
        count = count.add(new BigInteger(doc.get("freq")));
    }
    return count;
}
From source file:de.tudarmstadt.ukp.teaching.uima.nounDecompounding.web1t.LuceneIndexerTest.java
License:Open Source License
@Test
public void testSearch() throws Exception {
    // Check that the fields and all documents exist
    IndexReader ir = IndexReader.open(FSDirectory.open(targetIndex));
    Assert.assertEquals("Number of documents", 2, ir.numDocs());
    Document doc = ir.document(0);
    Assert.assertNotNull("Field: gram", doc.getField("gram"));
    Assert.assertNotNull("Field: freq", doc.getField("freq"));
    ir.close();

    // Search on the index
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(targetIndex));
    QueryParser p = new QueryParser(Version.LUCENE_30, "token", new StandardAnalyzer(Version.LUCENE_30));
    Query q = p.parse("gram:relax");
    Assert.assertEquals("Hit count 'relax'", 2, searcher.search(q, 100).totalHits);
    q = p.parse("gram:couch");
    Assert.assertEquals("Hit count 'couch'", 1, searcher.search(q, 100).totalHits);
    q = p.parse("gram:relax AND gram:couch");
    Assert.assertEquals("Hit count 'relax AND couch'", 1, searcher.search(q, 100).totalHits);
    q = p.parse("gram:couchdb");
    Assert.assertEquals("Hit count 'couchdb'", 1, searcher.search(q, 100).totalHits);
    searcher.close();
}
From source file:de.tudarmstadt.ukp.teaching.uima.nounDecompounding.web1t.LuceneIndexerTest.java
License:Open Source License
@Test
public void testData() throws Exception {
    IndexReader ir = IndexReader.open(FSDirectory.open(targetIndex));
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(targetIndex));
    QueryParser p = new QueryParser(Version.LUCENE_30, "gram", new StandardAnalyzer(Version.LUCENE_30));

    // Test if all data is set correctly
    Query q = p.parse("gram:couch");
    Document doc = ir.document(searcher.search(q, 100).scoreDocs[0].doc);
    Assert.assertEquals(new Integer(100), Integer.valueOf(doc.get("freq")));
    Assert.assertEquals("relax on the couch", doc.get("gram"));

    ir.close();
    searcher.close();
}
From source file:de.uni_koeln.spinfo.maalr.lucene.core.Dictionary.java
License:Apache License
public IndexStatistics getIndexStatistics() {
    final IndexStatistics statistics = new IndexStatistics();
    try {
        queue.push(new IndexOperation() {

            @Override
            public void execute() throws Exception {
                int all = indexProvider.getSearcher().getIndexReader().numDocs();
                int unverified = 0;
                int approved = 0;
                int unknown = 0;
                IndexReader reader = indexProvider.getSearcher().getIndexReader();
                HashMap<String, Integer> byCategory = new HashMap<String, Integer>();
                for (int i = 0; i < all; i++) {
                    Document document = reader.document(i);
                    String verification = document.get(LemmaVersion.VERIFICATION);
                    try {
                        if (Verification.ACCEPTED.equals(Verification.valueOf(verification))) {
                            approved++;
                        } else if (Verification.UNVERIFIED.equals(Verification.valueOf(verification))) {
                            unverified++;
                        } else {
                            unknown++;
                        }
                    } catch (Exception e) {
                        unknown++;
                    }
                    String overlayA = document.get(LemmaVersion.OVERLAY_LANG1);
                    if (overlayA != null) {
                        Integer old = byCategory.get(overlayA);
                        if (old == null)
                            old = 0;
                        byCategory.put(overlayA, old + 1);
                    }
                    String overlayB = document.get(LemmaVersion.OVERLAY_LANG2);
                    if (overlayB != null) {
                        Integer old = byCategory.get(overlayB);
                        if (old == null)
                            old = 0;
                        byCategory.put(overlayB, old + 1);
                    }
                }
                statistics.setOverlayCount(byCategory);
                statistics.setNumberOfEntries(all);
                statistics.setUnverifiedEntries(unverified);
                statistics.setApprovedEntries(approved);
                statistics.setUnknown(unknown);
                statistics.setLastUpdated(indexCreator.getLastUpdated());
            }
        });
        return statistics;
    } catch (Exception e) {
        return new IndexStatistics();
    }
}