List of usage examples for org.apache.lucene.index.Fields.terms(String)
public abstract Terms terms(String field) throws IOException;
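All of the examples below follow the same basic pattern: obtain a Fields instance (from MultiFields.getFields(reader), from a per-document term vector, or from an AtomicReader), look up the Terms of a single field, and walk the resulting TermsEnum. Here is a minimal sketch of that pattern, assuming the Lucene 4.x API that most of these examples target; the class name, field name, and path are placeholders:

import java.io.IOException;
import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;

public class TermsWalkSketch {
    // Enumerate every term of one field together with its document frequency.
    public static void dumpTerms(IndexReader reader, String field) throws IOException {
        Fields fields = MultiFields.getFields(reader); // merged view over all segments
        if (fields == null) {
            return; // empty index: no fields at all
        }
        Terms terms = fields.terms(field);
        if (terms == null) {
            return; // this field was never indexed
        }
        TermsEnum termsEnum = terms.iterator(null); // pass a previous enum to reuse it
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            System.out.println(text.utf8ToString() + " docFreq=" + termsEnum.docFreq());
        }
    }
}

A call would look like dumpTerms(DirectoryReader.open(FSDirectory.open(new File("/path/to/index"))), "contents"). Note the two null checks: both MultiFields.getFields and Fields.terms are documented to return null rather than an empty object.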
From source file:info.boytsov.lucene.FreqWordDict.java
License:Open Source License
public FreqWordDict(IndexReader reader, String fieldName, int minTermFreq, int maxTermQty) throws Exception {
    Fields fields = MultiFields.getFields(reader);
    terms = fields.terms(fieldName);
    TreeSet<TermDesc> tmpTerms = new TreeSet<TermDesc>();
    TermsEnum termIter = terms.iterator(null);
    for (int termId = 0; termIter.next() != null; ++termId) {
        if (termIter.docFreq() >= minTermFreq) {
            TermDesc ts = new TermDesc(fieldName, termId, termIter.term(), termIter.docFreq());
            tmpTerms.add(ts);
        }
    }
    termDescPos = new TreeMap<TermDesc, Integer>();
    termTextPos = new TreeMap<BytesRef, Integer>();
    int pos = 0;
    for (TermDesc ts : tmpTerms) {
        termDescPos.put(ts, pos);
        if (termTextPos.containsKey(ts.text)) {
            throw new Exception("Bug: the key '" + ts.getText() + "' is already in the map!");
        }
        termTextPos.put(ts.text, pos);
        if (++pos >= maxTermQty) {
            break;
        }
    }
}
From source file:info.boytsov.lucene.GetTotPostQty.java
License:Open Source License
public static void main(String[] args) {
    if (args.length != 1) {
        printUsage();
        System.exit(1);
    }
    String srcDirName = args[0];
    System.out.println("Source dir: " + srcDirName);
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));
        int docQty = reader.maxDoc();
        Fields fields = MultiFields.getFields(reader);
        Terms terms = fields.terms(FIELD_NAME);
        long totalInts = 0;
        int termQty = 0;
        for (TermsEnum termIter = terms.iterator(null); termIter.next() != null;) {
            totalInts += termIter.docFreq();
            ++termQty;
            if (termQty % 1000000 == 0) {
                System.out.println("Read " + termQty + " dictionary terms");
            }
        }
        System.out.println("Term qty: " + termQty + " Doc qty: " + docQty + " postings qty: " + totalInts);
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }
}
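One caveat when reading the output: TermsEnum.docFreq() does not subtract deleted documents, so on an index with deletions the reported postings total can be higher than what a freshly merged index would contain.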
From source file:io.anserini.index.IndexUtils.java
License:Apache License
void printIndexStats() throws IOException {
    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms(LuceneDocumentGenerator.FIELD_BODY);

    System.out.println("Index statistics");
    System.out.println("----------------");
    System.out.println("documents:             " + reader.numDocs());
    System.out.println("documents (non-empty): " + reader.getDocCount(LuceneDocumentGenerator.FIELD_BODY));
    System.out.println("unique terms:          " + terms.size());
    System.out.println("total terms:           " + reader.getSumTotalTermFreq(LuceneDocumentGenerator.FIELD_BODY));

    System.out.println("stored fields:");
    FieldInfos fieldInfos = MultiFields.getMergedFieldInfos(reader);
    for (String fd : fields) {
        FieldInfo fi = fieldInfos.fieldInfo(fd);
        System.out.println("  " + fd + " (indexOption: " + fi.getIndexOptions()
                + ", hasVectors: " + fi.hasVectors() + ", hasPayloads: " + fi.hasPayloads() + ")");
    }
}
From source file:io.datalayer.lucene.frequency.AosFrequencyTerms.java
License:Apache License
/**
 * @param reader     index reader
 * @param numTerms   number of top terms to return
 * @param fieldNames fields to scan, or null to scan all indexed fields
 * @return AosTermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static AosTermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    Fields fields = MultiFields.getFields(reader);
    if (fields == null) {
        LOGGER.info("Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
    }
    TermStatsQueue tiq = new TermStatsQueue(numTerms);
    TermsEnum te = null;
    if (fieldNames != null) {
        for (String field : fieldNames) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    } else {
        // No field list given: scan every indexed field.
        for (String field : fields) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    }
    AosTermStats[] result = new AosTermStats[tiq.size()];
    // We want highest docFreq first, so we drain the priority queue into
    // the array starting at the end and work backwards.
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}
From source file:it.cnr.ilc.lc.clavius.search.Tester.java
private static void searchWithContext(String term) {
    try {
        logger.info("searchWithContext(" + term + ")");
        SpanQuery spanQuery = new SpanTermQuery(new Term("content", term));
        Directory indexDirectory = FSDirectory.open(
                Paths.get("/var/lucene/claviusTest/indexes/it.cnr.ilc.lc.clavius.search.entity.PlainText"));
        DirectoryReader indexReader = DirectoryReader.open(indexDirectory);
        IndexSearcher searcher = new IndexSearcher(indexReader);
        IndexReader reader = searcher.getIndexReader();
        Spans spans = spanQuery.createWeight(searcher, false)
                .getSpans(searcher.getIndexReader().leaves().get(0), SpanWeight.Postings.POSITIONS);
        ScoreDoc[] sc = searcher.search(spanQuery, 10).scoreDocs;
        logger.info("hits :" + sc.length);
        int i;
        if (null != spans) {
            for (int k = 0; k < sc.length; k++) {
                int docId = sc[k].doc;
                logger.info("docID: " + docId);
                int newDocID = spans.advance(docId);
                logger.info("newDocID: " + newDocID);
                int nextSpan = -1;
                while ((nextSpan = spans.nextStartPosition()) != Spans.NO_MORE_POSITIONS) {
                    logger.info("nextSpan             : " + nextSpan);
                    logger.info("spans.startPosition(): " + spans.startPosition());
                    logger.info("spans.endPosition()  : " + spans.endPosition());
                    logger.info("spans.width()        : " + spans.width());
                    // Term vectors give per-document access to the terms of this field.
                    Fields fields = reader.getTermVectors(docId);
                    Terms terms = fields.terms("content");
                    TermsEnum termsEnum = terms.iterator();
                    BytesRef text;
                    PostingsEnum postingEnum = null;
                    int start = spans.startPosition() - 3;
                    int end = spans.endPosition() + 3;
                    while ((text = termsEnum.next()) != null) {
                        // Could store the BytesRef here, but String is easier for this example.
                        String s = new String(text.bytes, text.offset, text.length);
                        postingEnum = termsEnum.postings(postingEnum);
                        if (postingEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            i = 0;
                            int position = -1;
                            while (i < postingEnum.freq() && (position = postingEnum.nextPosition()) != -1) {
                                if (position >= start && position <= end) {
                                    logger.info("pos: " + position + ", term: " + s
                                            + " offset: " + text.offset + " length: " + text.length);
                                }
                                i++;
                            }
                        }
                    }
                }
            }
        } else {
            logger.info("no " + term + " found!");
        }
    } catch (IOException e) {
        logger.error(e.getMessage());
    }
    logger.info("End.");
}
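The design choice that distinguishes this example from the others: instead of walking the inverted index, it calls reader.getTermVectors(docId), which returns a per-document Fields view, so fields.terms("content") exposes only the terms of that one document. That is what makes the ±3-position context window around each span hit cheap to compute, but it only works if the field was indexed with term vectors and positions.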
From source file:lucene.searchengine.LuceneSearchEngine.java
public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    LuceneSearchEngine indexer = null;
    try {
        indexLocation = s;
        indexer = new LuceneSearchEngine(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from user until he enters q for quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call closeIndex;
    // otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));

    // ===========================================================
    // Get term frequencies
    // ===========================================================
    // Create an output file to store the term,term_frequency pairs.
    PrintWriter tfwriter = new PrintWriter("..\\term-frequency.csv");
    Fields fields = MultiFields.getFields(reader);
    HashMap<String, Long> tfmap = new HashMap<String, Long>();
    Terms terms = fields.terms("contents");
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef bref = null;
    while ((bref = termsEnum.next()) != null) {
        String term_name = new String(bref.bytes, bref.offset, bref.length);
        Term term_instance = new Term("contents", term_name);
        long termFrequency = reader.totalTermFreq(term_instance);
        tfmap.put(term_name, termFrequency);
    }
    System.out.println(tfmap.size());
    for (String key : tfmap.keySet()) {
        tfwriter.write(key + "," + tfmap.get(key));
        tfwriter.write("\n");
    }
    tfwriter.close();
    // ====================================================================
    // End of term-frequency extraction
    // ====================================================================

    IndexSearcher searcher = new IndexSearcher(reader);
    s = "";
    while (!s.equalsIgnoreCase("q")) {
        TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            Query q = new QueryParser(Version.LUCENE_47, "contents", sAnalyzer).parse(s);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. display results
            System.out.println("Found " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("filename") + " score=" + hits[i].score);
            }

            // 5. term stats --> watch out for which "version" of the term
            // must be checked here instead!
            Term termInstance = new Term("contents", s);
            long termFreq = reader.totalTermFreq(termInstance);
            long docCount = reader.docFreq(termInstance);
            System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }
    }
}
From source file:lucene.security.index.SecureAtomicReaderTestBase.java
License:Apache License
@Test
public void testTermWalk() throws IOException, ParseException {
    SecureAtomicReader secureReader = getSecureReader();
    Fields fields = secureReader.fields();
    for (String field : fields) {
        Terms terms = fields.terms(field);
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef ref;
        while ((ref = termsEnum.next()) != null) {
            System.out.println(field + " " + ref.utf8ToString());
            DocsEnum docsEnum = termsEnum.docs(null, null);
            int doc;
            while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                System.out.println(field + " " + ref.utf8ToString() + " " + doc);
            }
        }
    }
    secureReader.close();
}
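API note: termsEnum.docs(null, null) and DocsEnum are the Lucene 4.x postings API; in Lucene 5.0 they were folded into termsEnum.postings(...) and PostingsEnum, which the Clavius example above already uses.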
From source file:lucene.security.search.DocumentVisibilityFilter.java
License:Apache License
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
    AtomicReader reader = context.reader();
    List<DocIdSet> list = new ArrayList<DocIdSet>();
    Fields fields = reader.fields();
    Terms terms = fields.terms(_fieldName);
    if (terms == null) {
        if (acceptDocs instanceof DocIdSet) {
            return (DocIdSet) acceptDocs;
        } else {
            return wrap(acceptDocs);
        }
    }
    TermsEnum iterator = terms.iterator(null);
    BytesRef bytesRef;
    DocumentVisibilityEvaluator visibilityEvaluator = new DocumentVisibilityEvaluator(_authorizations);
    while ((bytesRef = iterator.next()) != null) {
        if (isVisible(visibilityEvaluator, bytesRef)) {
            DocIdSet docIdSet = _filterCacheStrategy.getDocIdSet(_fieldName, bytesRef, reader);
            if (docIdSet != null) {
                list.add(docIdSet);
            } else {
                DocsEnum docsEnum = iterator.docs(acceptDocs, null);
                list.add(buildCache(reader, docsEnum, bytesRef));
            }
        }
    }
    return getLogicalOr(list);
}
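Note how the filter leans on the null contract of Fields.terms(String): when the visibility field holds no indexed terms the method returns null, and the filter short-circuits by handing back the caller's acceptDocs (wrapped as a DocIdSet if necessary) rather than enumerating anything.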
From source file:lucenetools.TermData.java
License:Apache License
/**
 * Extract all terms from the indexed "contents" field. Ignore
 * any terms with freq counts less than the minTermFreq value or
 * greater than the maxPercentage value. Assign each term a
 * dictionary index.
 *
 * @param reader        reader for index
 * @param dictMap       mapping of index to term
 * @param minTermFreq   minimum value for frequency, else pruned
 * @param numDocs       total number of documents in corpus
 * @param maxPercentage maximum value for percentage, else pruned
 * @return boolean whether or not there was an error
 */
public static boolean extractTerms(IndexReader reader, Map<String, Integer> dictMap, int minTermFreq,
        int numDocs, int maxPercentage) {
    // a TreeSet sorts the dictionary strings
    Set<String> dictionary = new TreeSet<>();
    int maxOccurrences = (int) (((double) maxPercentage / 100.0) * (double) numDocs);

    try {
        Fields fields = MultiFields.getFields(reader);
        if (null != fields) {
            // get terms derived from content
            Terms terms = fields.terms(CONTENTSFIELD);
            TermsEnum te = terms.iterator(null);
            while (te.next() != null) {
                String term_str = te.term().utf8ToString();
                // number of occurrences of this term in all docs (rowsum)
                int total_term_freq = (int) (te.totalTermFreq());
                if (total_term_freq >= minTermFreq) {
                    if (maxPercentage != 100) {
                        if (total_term_freq <= maxOccurrences) {
                            dictionary.add(term_str);
                        }
                    } else {
                        dictionary.add(term_str);
                    }
                }
            }
        } else {
            System.err.println("No fields found in index.");
            return false;
        }
    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
        return false;
    }

    // build a map associating the dictionary index with each term
    int index = 0;
    dictMap.clear();
    for (String s : dictionary)
        dictMap.put(s, index++);
    return true;
}
From source file:lux.IndexTestSupport.java
License:Mozilla Public License
public static void printAllTerms(Directory dir, XmlIndexer indexer) throws IOException {
    DirectoryReader reader = DirectoryReader.open(dir);
    Fields fields = MultiFields.getFields(reader);
    System.out.println("Printing all terms (except uri)");
    String uriFieldName = indexer.getConfiguration().getFieldName(FieldRole.URI);
    for (String field : fields) {
        if (field.equals(uriFieldName)) {
            continue;
        }
        Terms terms = fields.terms(field);
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef text;
        int count = 0;
        while ((text = termsEnum.next()) != null && count++ < 100) {
            System.out.println(field + " " + text.utf8ToString() + ' ' + termsEnum.docFreq());
        }
    }
    reader.close();
}