List of usage examples for org.apache.lucene.index IndexReader document
public final Document document(int docID) throws IOException
Returns the stored fields of the nth Document in this index.

From source file:com.sun.socialsite.business.impl.LuceneSearchManagerImpl.java
License:Open Source License
/**
 * @return false if the index entry was not updated because it
 *         was already current; true otherwise.
 */
public boolean addToIndex(final Profile profile) throws IOException {
    boolean needNewEntry = true;
    String key = getKey(profile);
    String userId = profile.getUserId();
    String firstName = profile.getFirstName();
    String middleName = profile.getMiddleName();
    String lastName = profile.getLastName();
    String nickName = profile.getNickName();
    String primaryEmail = profile.getPrimaryEmail();
    String displayName = profile.getDisplayName();
    IndexReader reader = IndexReader.open(indexDir);
    TermDocs termDocs = reader.termDocs(new Term("key", key));
    while (termDocs.next()) {
        Document existingDoc = reader.document(termDocs.doc());
        if (areEqual("profile", existingDoc.get("class"))
                && areEqual(userId, existingDoc.get("userId"))
                && areEqual(firstName, existingDoc.get("firstName"))
                && areEqual(middleName, existingDoc.get("middleName"))
                && areEqual(lastName, existingDoc.get("lastName"))
                && areEqual(nickName, existingDoc.get("nickName"))
                && areEqual(primaryEmail, existingDoc.get("primaryEmail"))
                && areEqual(displayName, existingDoc.get("displayName"))) {
            needNewEntry = false;
        }
    }
    termDocs.close();
    reader.close();
    if (needNewEntry) {
        Document newDoc = new Document();
        newDoc.add(new Field("key", key, Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("class", "profile", Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("userId", userId, Field.Store.YES, Field.Index.UN_TOKENIZED));
        if (firstName != null)
            newDoc.add(new Field("firstName", firstName, Field.Store.YES, Field.Index.TOKENIZED));
        if (middleName != null)
            newDoc.add(new Field("middleName", middleName, Field.Store.YES, Field.Index.TOKENIZED));
        if (lastName != null)
            newDoc.add(new Field("lastName", lastName, Field.Store.YES, Field.Index.TOKENIZED));
        if (nickName != null)
            newDoc.add(new Field("nickName", nickName, Field.Store.YES, Field.Index.TOKENIZED));
        if (primaryEmail != null)
            newDoc.add(new Field("primaryEmail", primaryEmail, Field.Store.YES, Field.Index.UN_TOKENIZED));
        if (displayName != null)
            newDoc.add(new Field("displayName", displayName, Field.Store.YES, Field.Index.TOKENIZED));
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(indexDir, analyzer, false);
            writer.deleteDocuments(new Term("key", key)); // Delete old entry, if present
            writer.addDocument(newDoc);
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception e) {
                }
            }
        }
        log.trace(String.format(
                "Indexed profile[userId=%s,firstName=%s,lastName=%s,nickName=%s,primaryEmail=%s,displayName=%s]",
                userId, firstName, lastName, nickName, primaryEmail, displayName));
    }
    return needNewEntry;
}
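For reference, a minimal sketch of the by-key lookup pattern the example above uses, via the Lucene 3.x TermDocs API (the class name and the "key"/"userId" field names are illustrative, not part of the original source):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.store.FSDirectory;

public class KeyLookup {
    /** Returns the stored "userId" of the first document whose "key" field matches, or null. */
    public static String findUserId(File indexDir, String key) throws IOException {
        IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
        try {
            TermDocs termDocs = reader.termDocs(new Term("key", key));
            try {
                if (termDocs.next()) {
                    // termDocs.doc() gives the docID; document() loads its stored fields
                    Document doc = reader.document(termDocs.doc());
                    return doc.get("userId");
                }
                return null;
            } finally {
                termDocs.close();
            }
        } finally {
            reader.close();
        }
    }
}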
From source file:com.tamingtext.classifier.bayes.ExtractTrainingData.java
License:Apache License
/**
 * Extract training data from a lucene index.
 * <p>
 * Iterates over documents in the lucene index; the values in the categoryFields are inspected and, if found to
 * contain any of the strings found in the category file, a training data item will be emitted, assigned to the
 * matching category and containing the terms found in the fields listed in textFields. Output is written to
 * the output directory with one file per category.
 * <p>
 * The category file contains one line per category; each line contains a number of whitespace-delimited strings.
 * The first string on each line is the category name, while subsequent strings will be used to identify documents
 * that belong in that category.
 * <p>
 * 'Technology Computers Macintosh' will cause documents that contain either 'Technology', 'Computers' or 'Macintosh'
 * in one of their categoryFields to be assigned to the 'Technology' category.
 *
 * @param indexDir directory of lucene index to extract from
 * @param categoryFile file containing category strings to extract
 * @param categoryFields list of fields to match against category data
 * @param textFields list of fields containing terms to extract
 * @param outputDir directory to write output to
 * @throws IOException
 */
public static void extractTraininingData(File indexDir, File categoryFile, Collection<String> categoryFields,
        Collection<String> textFields, File outputDir, boolean useTermVectors) throws IOException {

    log.info("Index dir: " + indexDir);
    log.info("Category file: " + categoryFile);
    log.info("Output dir: " + outputDir);
    log.info("Category fields: " + categoryFields.toString());
    log.info("Text fields: " + textFields.toString());
    log.info("Use Term Vectors?: " + useTermVectors);

    OpenObjectIntHashMap<String> categoryCounts = new OpenObjectIntHashMap<String>();
    Map<String, List<String>> categories = readCategoryFile(categoryFile);

    Directory dir = FSDirectory.open(indexDir);
    IndexReader reader = IndexReader.open(dir, true);
    int max = reader.maxDoc();

    StringBuilder buf = new StringBuilder();
    for (int i = 0; i < max; i++) {
        if (!reader.isDeleted(i)) {
            Document d = reader.document(i);
            String category = null;
            // determine whether any of the fields in this document contain a
            // category in the category list
            fields: for (String field : categoryFields) {
                for (Field f : d.getFields(field)) {
                    if (f.isStored() && !f.isBinary()) {
                        String fieldValue = f.stringValue().toLowerCase();
                        for (String cat : categories.keySet()) {
                            List<String> cats = categories.get(cat);
                            for (String c : cats) {
                                if (fieldValue.contains(c)) {
                                    category = cat;
                                    break fields;
                                }
                            }
                        }
                    }
                }
            }

            if (category == null)
                continue;

            // append the terms from each of the textFields to the training data for this document.
            buf.setLength(0);
            for (String field : textFields) {
                if (useTermVectors) {
                    appendVectorTerms(buf, reader.getTermFreqVector(i, field));
                } else {
                    appendFieldText(buf, d.getField(field));
                }
            }
            getWriterForCategory(outputDir, category).printf("%s\t%s\n", category, buf.toString());
            categoryCounts.adjustOrPutValue(category, 1, 1);
        }
    }

    if (log.isInfoEnabled()) {
        StringBuilder b = new StringBuilder();
        b.append("\nCategory document counts:\n");
        LinkedList<String> keyList = new LinkedList<String>();
        categoryCounts.keysSortedByValue(keyList);
        String key;
        while (!keyList.isEmpty()) {
            key = keyList.removeLast();
            b.append(categoryCounts.get(key)).append('\t').append(key).append('\n');
        }
        log.info(b.toString());
    }
}
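The iteration idiom above (maxDoc, isDeleted, document) recurs in several of the examples below. A stripped-down sketch of just that loop, assuming a Lucene 3.x index with a stored "title" field (class and field names are hypothetical):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class DumpStoredField {
    /** Prints the stored "title" field of every live (non-deleted) document. */
    public static void dump(File indexDir) throws IOException {
        IndexReader reader = IndexReader.open(FSDirectory.open(indexDir), true); // read-only
        try {
            int max = reader.maxDoc();              // docIDs range over [0, maxDoc)
            for (int i = 0; i < max; i++) {
                if (reader.isDeleted(i)) {
                    continue;                       // document(i) is undefined for deleted docs
                }
                Document d = reader.document(i);    // loads only the stored fields
                System.out.println(d.get("title")); // null if "title" was not stored
            }
        } finally {
            reader.close();
        }
    }
}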
From source file:com.tamingtext.classifier.mlt.MoreLikeThisQueryTest.java
License:Apache License
@Test
public void testMoreLikeThisQuery() throws Exception {
    //<start id="lucene.examples.mlt.setup"/>
    Directory directory = FSDirectory.open(new File(modelPath));
    IndexReader indexReader = IndexReader.open(directory); //<co id="mlt.indexsetup"/>
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);

    Analyzer analyzer //<co id="mlt.analyzersetup"/>
            = new EnglishAnalyzer(Version.LUCENE_36);

    if (nGramSize > 1) { //<co id="mlt.ngramsetup"/>
        analyzer = new ShingleAnalyzerWrapper(analyzer, nGramSize, nGramSize);
    }

    MoreLikeThis moreLikeThis = new MoreLikeThis(indexReader); //<co id="mlt.configure"/>
    moreLikeThis.setAnalyzer(analyzer);
    moreLikeThis.setFieldNames(new String[] { "content" });

    /*<calloutlist>
    <callout arearefs="mlt.indexsetup">Open Index</callout>
    <callout arearefs="mlt.analyzersetup">Setup Analyzer</callout>
    <callout arearefs="mlt.ngramsetup">Setup NGrams</callout>
    <callout arearefs="mlt.configure">Create <classname>MoreLikeThis</classname></callout>
    </calloutlist>*/
    //<end id="lucene.examples.mlt.setup"/>

    // for testing against the same corpus
    moreLikeThis.setMinTermFreq(1);
    moreLikeThis.setMinDocFreq(1);

    //<start id="lucene.examples.mlt.query"/>
    Reader reader = new FileReader(inputPath); //<co id="mlt.query"/>
    Query query = moreLikeThis.like(reader);

    TopDocs results = indexSearcher.search(query, maxResults); //<co id="mlt.search"/>

    HashMap<String, CategoryHits> categoryHash = new HashMap<String, CategoryHits>();
    for (ScoreDoc sd : results.scoreDocs) { //<co id="mlt.collect"/>
        Document d = indexReader.document(sd.doc);
        Fieldable f = d.getFieldable(categoryFieldName);
        String cat = f.stringValue();
        CategoryHits ch = categoryHash.get(cat);
        if (ch == null) {
            ch = new CategoryHits();
            ch.setLabel(cat);
            categoryHash.put(cat, ch);
        }
        ch.incrementScore(sd.score);
    }

    SortedSet<CategoryHits> sortedCats //<co id="mlt.rank"/>
            = new TreeSet<CategoryHits>(CategoryHits.byScoreComparator());
    sortedCats.addAll(categoryHash.values());
    for (CategoryHits c : sortedCats) { //<co id="mlt.display"/>
        System.out.println(c.getLabel() + "\t" + c.getScore());
    }
    /*<calloutlist>
    <callout arearefs="mlt.query">Create Query</callout>
    <callout arearefs="mlt.search">Perform Search</callout>
    <callout arearefs="mlt.collect">Collect Results</callout>
    <callout arearefs="mlt.rank">Rank Categories</callout>
    <callout arearefs="mlt.display">Display Categories</callout>
    </calloutlist>*/
    //<end id="lucene.examples.mlt.query"/>
}
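A hit from IndexSearcher carries only a docID and a score; resolving it to stored fields is where document(int) comes in, as in the collect step above. A minimal sketch of just that step against the Lucene 3.x API (class and the "category" field name are hypothetical):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class HitFields {
    /** Prints the stored "category" field of the top hits for a query. */
    public static void printCategories(File indexDir, Query query, int maxHits) throws IOException {
        IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs results = searcher.search(query, maxHits);
            for (ScoreDoc sd : results.scoreDocs) {
                // sd.doc is the docID of the hit; document() loads its stored fields
                Document d = reader.document(sd.doc);
                System.out.println(sd.score + "\t" + d.get("category"));
            }
        } finally {
            reader.close();
        }
    }
}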
From source file:com.tamingtext.tagging.LuceneCategoryExtractor.java
License:Apache License
/**
 * Dump the values stored in the specified field for each document,
 * one stored value per line.
 *
 * @param indexDir the index to read.
 * @param field the name of the field.
 * @param maxDocs the maximum number of documents to dump (not enforced by this method).
 * @param out the print writer output will be written to.
 * @throws IOException
 */
public static void dumpDocumentFields(File indexDir, String field, long maxDocs, PrintWriter out)
        throws IOException {
    Directory dir = FSDirectory.open(indexDir);
    IndexReader reader = IndexReader.open(dir, true);
    int max = reader.maxDoc();
    for (int i = 0; i < max; i++) {
        if (!reader.isDeleted(i)) {
            Document d = reader.document(i);
            for (Field f : d.getFields(field)) {
                if (f.isStored() && !f.isBinary()) {
                    String value = f.stringValue();
                    if (value != null) {
                        out.printf("%s\n", value);
                    }
                }
            }
        }
    }
}
From source file:com.tinkerpop.graph.benchmark.index.LuceneKeyToNodeIdIndexImpl.java
License:Apache License
@Override
public long getGraphNodeId(String udk) {
    Long result = hotCache.get(udk);
    if (result == null) {
        // fail fast on bloom filter
        int bloomKey = Math.abs(udk.hashCode() % bloomFilterSize);
        if (!bloomFilter.fastGet(bloomKey)) {
            // Not seen - fail
            bloomReadSaves++;
            return -1;
        }
        result = uncommittedKeyBuffer.get(udk);
        if (result != null) {
            return result;
        }
        if (reader == null) {
            try {
                reader = IndexReader.open(dir, true);
                subreaders = reader.getSequentialSubReaders();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
        try {
            Term searchTerm = term.createTerm(udk);
            for (IndexReader r : subreaders) {
                TermDocs td = r.termDocs(searchTerm);
                if (td.next()) {
                    Document doc = r.document(td.doc());
                    result = Long.parseLong(doc.get("id"));
                    hotCache.put(udk, result);
                    successfulLuceneReads++;
                    return result;
                }
            }
            failedLuceneReads++;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    } else {
        hotCacheHits++;
    }
    if (result == null) {
        return -1;
    } else {
        return result;
    }
}
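Note that the example passes each docID from TermDocs back to the same sub-reader it came from: docIDs returned by a segment reader are segment-local. A condensed sketch of that per-segment lookup (the class, method, and "id" field names are illustrative):

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;

public class SegmentLookup {
    /** Looks up a stored "id" by key, probing each segment directly. Returns -1 if absent. */
    public static long lookup(IndexReader topReader, String keyField, String key) throws IOException {
        Term searchTerm = new Term(keyField, key);
        IndexReader[] segments = topReader.getSequentialSubReaders();
        if (segments == null) {
            segments = new IndexReader[] { topReader }; // atomic reader: no sub-readers
        }
        for (IndexReader segment : segments) {
            TermDocs td = segment.termDocs(searchTerm);
            try {
                if (td.next()) {
                    // td.doc() is segment-local, so resolve it against the same
                    // segment reader, not the top-level composite reader
                    Document doc = segment.document(td.doc());
                    return Long.parseLong(doc.get("id"));
                }
            } finally {
                td.close();
            }
        }
        return -1;
    }
}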
From source file:com.zimbra.cs.index.RawIndexEditor.java
License:Open Source License
void dumpAll() throws IOException {
    IndexReader reader = IndexReader.open(luceneDirectory);
    try {
        int maxDoc = reader.maxDoc();
        System.out.println("There are " + maxDoc + " documents in this index.");
        for (int i = 0; i < maxDoc; i++) {
            dumpDocument(reader.document(i), reader.isDeleted(i));
        }
    } finally {
        reader.close();
    }
}
From source file:com.zimbra.cs.rmgmt.RemoteMailQueue.java
License:Open Source License
private void list0(SearchResult result, IndexReader indexReader, int offset, int limit) throws IOException {
    if (ZimbraLog.rmgmt.isDebugEnabled()) {
        ZimbraLog.rmgmt.debug("listing offset=" + offset + " limit=" + limit + " " + this);
    }
    int max = indexReader.maxDoc();
    int skip = 0;
    int listed = 0;
    for (int i = 0; i < max; i++) {
        if (indexReader.isDeleted(i)) {
            continue;
        }
        if (skip < offset) {
            skip++;
            continue;
        }
        Document doc = indexReader.document(i);
        Map<QueueAttr, String> qitem = docToQueueItem(doc);
        result.qitems.add(qitem);
        listed++;
        if (listed == limit) {
            break;
        }
    }
    result.hits = getNumMessages();
}
From source file:db.infiniti.config.HighFreqTerms.java
License:Apache License
private TermStats[] getLowerHigherEqualSpecificFreqTerms(IndexReader reader, int numTerms, String field,
        ArrayList<String> sentQueries, int specificFreq, boolean allranges) throws Exception {
    TermInfoLowerFreqThanX tiqLower = new TermInfoLowerFreqThanX(numTerms, specificFreq);
    TermInfoHigherFreqThanX tiqHigher = new TermInfoHigherFreqThanX(numTerms, specificFreq);
    TermInfoEqualFreqThanX tiqEqual = new TermInfoEqualFreqThanX(numTerms, specificFreq);
    if (field != null) {
        Document aDoc = reader.document(0);
        // reader.getTermFreqVector(0, field);
        TermEnum terms = reader.terms(new Term(field));
        do {
            if (terms != null && terms.term() != null) {
                String textOfTerm = terms.term().text();
                if (terms.term().field().equals(field)) {
                    if (!textEditor.isRefinedQueryStopWordLength(textOfTerm)
                            && !sentQueries.contains(textOfTerm)) {
                        tiqLower.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                        tiqHigher.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                        if (terms.docFreq() == specificFreq) {
                            tiqEqual.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                        }
                    }
                }
            }
        } while (terms.next());
    } else {
        TermEnum terms = reader.terms();
        while (terms.next()) {
            String textOfTerm = terms.term().text();
            if (!textEditor.isRefinedQueryStopWordLength(textOfTerm) && !sentQueries.contains(textOfTerm)) {
                tiqLower.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                tiqHigher.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                if (terms.docFreq() == specificFreq) {
                    tiqEqual.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                }
            }
        }
    }
    TermStats[] result;
    int count;
    // we want highest first, so we read each queue and populate the array
    // starting at the end and work backwards
    if (allranges) {
        result = new TermStats[tiqLower.size() + tiqHigher.size() + tiqEqual.size()];
        count = tiqHigher.size() - 1 + tiqLower.size() + tiqEqual.size();
        while (tiqHigher.size() != 0) {
            result[count] = (TermStats) tiqHigher.pop();
            count--;
        }
        count = tiqLower.size() + tiqEqual.size() - 1;
        while (tiqEqual.size() != 0) {
            result[count] = (TermStats) tiqEqual.pop();
            count--;
        }
        count = tiqLower.size() - 1;
        while (tiqLower.size() != 0) {
            result[count] = (TermStats) tiqLower.pop();
            count--;
        }
    } else {
        result = new TermStats[tiqEqual.size()];
        count = tiqEqual.size() - 1;
        while (tiqEqual.size() != 0) {
            result[count] = (TermStats) tiqEqual.pop();
            count--;
        }
    }
    return result;
}
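This example (and the next) also illustrates the Lucene 3.x TermEnum idiom: reader.terms(Term) returns an enumeration already positioned at the first term at or after the given one (hence the do/while), whereas reader.terms() starts before the first term (hence the plain while). A minimal sketch of walking one field's terms with their document frequencies (class name hypothetical):

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;

public class TermWalk {
    /** Prints every term in the given field together with its document frequency. */
    public static void dumpTerms(IndexReader reader, String field) throws IOException {
        TermEnum terms = reader.terms(new Term(field, "")); // positioned at first term >= (field, "")
        try {
            do {
                Term t = terms.term();
                if (t == null || !t.field().equals(field)) {
                    break; // the enumeration spans all fields, so stop on leaving ours
                }
                System.out.println(t.text() + "\t" + terms.docFreq());
            } while (terms.next());
        } finally {
            terms.close();
        }
    }
}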
From source file:db.infiniti.config.HighFreqTerms.java
License:Apache License
/**
 * @param reader
 * @param numTerms
 * @param field
 * @param sentQueries
 * @param initialQuery
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field,
        ArrayList<String> sentQueries, List<String> initialQuery) throws Exception {
    TermInfoWiTFQueueForHighFreq tiq = new TermInfoWiTFQueueForHighFreq(numTerms);
    if (field != null) {
        Document aDoc = reader.document(0);
        // reader.getTermFreqVector(0, field);
        TermEnum terms = reader.terms(new Term(field));
        do {
            if (terms != null && terms.term() != null) {
                String textOfTerm = terms.term().text();
                if (terms.term().field().equals(field)) {
                    if (!textEditor.isRefinedQueryStopWordLength(textOfTerm)
                            && !sentQueries.contains(textOfTerm) && !initialQuery.contains(textOfTerm)) {
                        tiq.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
                    }
                }
            }
        } while (terms.next());
    } else {
        TermEnum terms = reader.terms();
        while (terms.next()) {
            String textOfTerm = terms.term().text();
            if (!textEditor.isRefinedQueryStopWordLength(textOfTerm) && !sentQueries.contains(textOfTerm)
                    && !initialQuery.contains(textOfTerm)) {
                tiq.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
            }
        }
    }
    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first, so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = (TermStats) tiq.pop();
        count--;
    }
    return result;
}
From source file:de.dkt.eservices.elucene.indexmanagement.SearchFiles.java
License:Apache License
/**
 * Searches a query against one or more fields of an index and returns hitsToReturn documents.
 * @param index index where to search for the query text
 * @param sFields semicolon-separated list of document fields to match the query against
 * @param sAnalyzers semicolon-separated list of analyzers, one per field
 * @param queryType type of query to build
 * @param queryString text of the input query
 * @param language language of the query
 * @param hitsToReturn number of documents to be returned
 * @return JSON object containing the results information and content
 * @throws ExternalServiceFailedException
 */
public static JSONObject search(String index, String sFields, String sAnalyzers, String queryType,
        String queryString, String language, int hitsToReturn) throws ExternalServiceFailedException {
    try {
        Date start = new Date();
        File f = FileFactory.generateFileInstance(indexDirectory + index);
        if (f == null || !f.exists()) {
            throw new ExternalServiceFailedException(
                    "Specified index [" + indexDirectory + index + "] does not exist.");
        }
        logger.info("Searching in folder: " + f.getAbsolutePath());
        Directory dir = FSDirectory.open(f);
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        Document doc = reader.document(0);

        String[] fields = sFields.split(";");
        String[] analyzers = sAnalyzers.split(";");
        if (fields.length != analyzers.length) {
            logger.error("The number of fields and analyzers is different");
            throw new BadRequestException("The number of fields and analyzers is different");
        }

        Query query = OwnQueryParser.parseQuery(queryType, queryString, fields, analyzers, language);
        TopDocs results = searcher.search(query, hitsToReturn);
        Explanation exp = searcher.explain(query, 0);

        Date end = new Date();
        logger.info("Time: " + (end.getTime() - start.getTime()) + "ms");

        JSONObject resultModel = JSONLuceneResultConverter.convertResults(query, searcher, results);
        reader.close();
        return resultModel;
    } catch (IOException e) {
        e.printStackTrace();
        throw new ExternalServiceFailedException("IOException with message: " + e.getMessage());
    }
}
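This last example uses the Lucene 4.x API, where the deprecated IndexReader.open has been replaced by DirectoryReader.open but document(int) works as before. A minimal self-contained sketch of the same flow (class, parameter, and field names are hypothetical):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ModernSearch {
    /** Prints a stored field for each of the top n hits, using the Lucene 4.x reader API. */
    public static void printHits(File indexDir, Query query, String storedField, int n) throws IOException {
        Directory dir = FSDirectory.open(indexDir);
        IndexReader reader = DirectoryReader.open(dir); // replaces IndexReader.open from 3.x
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            for (ScoreDoc sd : searcher.search(query, n).scoreDocs) {
                Document d = reader.document(sd.doc); // document(int) still loads stored fields
                System.out.println(d.get(storedField));
            }
        } finally {
            reader.close();
        }
    }
}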