List of usage examples for org.apache.lucene.index.Fields.terms(String)
public abstract Terms terms(String field) throws IOException;
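Fields.terms returns the Terms for the named field, or null when the field is absent or stores no indexed terms, so callers should null-check the result before iterating. Below is a minimal sketch of the common pattern, assuming a Lucene 5/6-era API; the index path "/path/to/index" and field name "body" are placeholders, not values taken from the examples that follow. Note that the snippets below span several Lucene versions: iterator(null) and AtomicReaderContext are 4.x idioms, while the no-argument iterator() is 5.x and later.

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class FieldsTermsSketch {
    public static void main(String[] args) throws IOException {
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // merged view over all segments; null if the index has no fields
            Fields fields = MultiFields.getFields(reader);
            if (fields == null) {
                return;
            }
            Terms terms = fields.terms("body");
            if (terms == null) {
                return; // field does not exist or stores no terms
            }
            TermsEnum te = terms.iterator();
            BytesRef term;
            while ((term = te.next()) != null) {
                System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
            }
        }
    }
}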
From source file:perf.ShowFields.java
License:Apache License
public static void main(String[] args) throws CorruptIndexException, IOException {
    DirectoryReader reader = DirectoryReader.open(FSDirectory.open(
            new File("/home/simon/work/projects/lucene/bench/indices/Standard.work.trunk.wiki.nd0.1M/index")));
    Fields fields = MultiFields.getFields(reader);
    for (String name : fields) {
        System.out.println(name);
        if (name.equals("docdate")) {
            // Lucene 4.x API: iterator(null) creates a fresh TermsEnum
            TermsEnum terms = fields.terms(name).iterator(null);
            BytesRef ref;
            int i = 0;
            // print only the first 11 terms of the field
            while ((ref = terms.next()) != null) {
                System.out.println(ref.utf8ToString());
                if (i++ == 10) {
                    break;
                }
            }
        }
    }
}
From source file:pretraga.IsolationSimilarity.java
public void test(String vec) {
    List<String> vector = processInput(vec);
    HashMap<String, Long> map = new HashMap<>();
    try {
        Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        List<Integer> docId = getDocumentsFromVector(vector, reader, searcher);
        for (int i = 0; i < docId.size(); i++) {
            Fields ff = reader.getTermVectors(docId.get(i));
            Terms terms = ff.terms(CONTENT);
            TermsEnum te = terms.iterator();
            // guard against a non-ClassicSimilarity searcher; the original
            // dereferenced sim unconditionally and could throw an NPE
            ClassicSimilarity sim = null;
            if (searcher.getSimilarity(true) instanceof ClassicSimilarity) {
                sim = (ClassicSimilarity) searcher.getSimilarity(true);
            }
            if (sim == null) {
                continue;
            }
            BytesRef by;
            while ((by = te.next()) != null) {
                String term = by.utf8ToString();
                float idf = sim.idf(te.docFreq(), reader.maxDoc());
                float tf = sim.tf(te.totalTermFreq());
                // System.out.println("idf = " + idf + ", tf = " + tf + ", docF: " + te.totalTermFreq());
                TermStatistics ts = new TermStatistics(by, te.docFreq(), te.totalTermFreq());
                CollectionStatistics s = new CollectionStatistics(CONTENT, reader.maxDoc(), terms.getDocCount(),
                        terms.getSumTotalTermFreq(), terms.getSumDocFreq());
                Document d = reader.document(docId.get(i));
                if (vector.contains(term)) {
                    float score = sim.simScorer(sim.computeWeight(s, ts), reader.getContext().leaves().get(0))
                            .score(docId.get(i), te.totalTermFreq());
                    System.out.println(score + ", " + d.get(TITLE) + ", term: " + term);
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace(); // do not swallow index errors silently
    }
}
From source file:project.lucene.RelativeTermWeightQuery.java
License:Apache License
public void collectTermContext(IndexReader reader, List<AtomicReaderContext> leaves, TermContext[] contextArray,
        Term[] queryTerms) throws IOException {
    TermsEnum termsEnum = null;
    for (AtomicReaderContext context : leaves) {
        final Fields fields = context.reader().fields();
        if (fields == null) {
            // reader has no fields
            continue;
        }
        for (int i = 0; i < queryTerms.length; i++) {
            Term term = queryTerms[i];
            TermContext termContext = contextArray[i];
            final Terms terms = fields.terms(term.field());
            if (terms == null) {
                // field does not exist
                continue;
            }
            termsEnum = terms.iterator(termsEnum);
            assert termsEnum != null;
            if (termsEnum == TermsEnum.EMPTY) {
                continue;
            }
            if (termsEnum.seekExact(term.bytes())) {
                if (termContext == null) {
                    contextArray[i] = new TermContext(reader.getContext(), termsEnum.termState(), context.ord,
                            termsEnum.docFreq(), termsEnum.totalTermFreq());
                } else {
                    termContext.register(termsEnum.termState(), context.ord, termsEnum.docFreq(),
                            termsEnum.totalTermFreq());
                }
            }
        }
    }
}
From source file:searchenginelucene.LuceneSearchEngine.java
public static void getTermFrequencyPairs(String indexLocation) throws IOException {
    Map<String, Integer> termfrequency = new HashMap<String, Integer>();
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    // Temporary location to store the intermediate term frequency results
    PrintWriter writer_tf = new PrintWriter("..\\terf-frequency.csv");
    int docnum = reader.numDocs();
    // System.out.println("docnum:" + docnum);
    Fields fields1 = MultiFields.getFields(reader);
    for (String field : fields1) {
        // use the field being iterated; the original looked up "contents"
        // on every pass, recounting the same terms once per field
        Terms terms1 = fields1.terms(field);
        TermsEnum termsEnum = terms1.iterator(null);
        int noWords = 0;
        while (termsEnum.next() != null) {
            noWords++;
            int count = 0;
            DocsEnum docsEnum = termsEnum.docs(null, null);
            int docIdEnum;
            // System.out.print("The term is->" + termsEnum.term().utf8ToString());
            while ((docIdEnum = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                count += docsEnum.freq();
            }
            // System.out.println("count:" + count);
            termfrequency.put(termsEnum.term().utf8ToString(), count);
        }
        System.out.println("Total Number of Words:" + noWords);
    }
    // =========================================================
    // Write the terms and their frequencies in a file
    // =========================================================
    for (String key : termfrequency.keySet()) {
        writer_tf.print(key + ",");
        writer_tf.println(termfrequency.get(key));
    }
    writer_tf.close();
}
From source file:stackoverflow.lucene.modified.MoreLikeThis.java
License:Apache License
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<String, Int>();
    for (String fieldName : fieldNames) {
        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }
        // field does not store term vector info
        if (vector == null) {
            Document d = ir.document(docNum);
            IndexableField fields[] = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermFrequencies(termFreqMap, vector);
        }
    }
    return createQueue(termFreqMap);
}
From source file:tech.beshu.ror.es.security.DocumentFieldReader.java
License:Open Source License
@Override
public Fields getTermVectors(int docID) throws IOException {
    Fields original = in.getTermVectors(docID);
    return new Fields() {
        @Override
        public Iterator<String> iterator() {
            return Iterators.filter(original.iterator(), s -> policy.canKeep(s));
        }

        @Override
        public Terms terms(String field) throws IOException {
            return policy.canKeep(field) ? original.terms(field) : null;
        }

        @Override
        public int size() {
            return remainingFieldsInfo.size();
        }
    };
}
From source file:tw.com.kyle.luminance.LumReader.java
public TokenStream GetTokenStream(int docId, String field) throws IOException {
    Fields tvFields = reader.getTermVectors(docId);
    // getTermVectors returns null for documents without stored term vectors;
    // the original dereferenced tvFields unconditionally and could throw an NPE
    if (tvFields == null || tvFields.terms(field) == null) {
        return null;
    }
    return TokenSources.getTokenStream(field, tvFields, "", new StandardAnalyzer(), -1);
}
From source file:utils.HighFreqTerms.java
License:Apache License
/**
 * @param reader
 * @param numTerms
 * @param fieldNames
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;
    if (fieldNames != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        for (String field : fieldNames) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator();
                fillQueue(te, tiq, field);
            }
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        Iterator<String> fieldIterator = fields.iterator();
        while (fieldIterator.hasNext()) {
            String field = fieldIterator.next();
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator();
                fillQueue(te, tiq, field);
            }
        }
    }
    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}