List of usage examples for the method org.apache.lucene.index.Terms#size()
public abstract long size() throws IOException;
From source file:com.github.flaxsearch.api.TermsData.java
License:Apache License
/**
 * Snapshot of a field's term-level statistics plus its decoded term strings.
 *
 * @param terms     the Lucene {@code Terms} for the field being summarised
 * @param termsList the already-decoded term strings to expose
 * @param encoding  encoding used to render the min/max {@code BytesRef} terms
 * @throws IOException if reading the term statistics from the index fails
 */
public TermsData(Terms terms, List<String> termsList, String encoding) throws IOException {
    this.terms = termsList;
    this.termCount = terms.size();
    this.docCount = terms.getDocCount();
    // min/max are raw BytesRefs; encode them for display using the requested encoding
    this.minTerm = BytesRefUtils.encode(terms.getMin(), encoding);
    this.maxTerm = BytesRefUtils.encode(terms.getMax(), encoding);
}
From source file:com.meizu.nlp.classification.utils.DocToDoubleVectorUtils.java
License:Apache License
/** * create a sparse <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc * @param docTerms term vectors for a given document * @param fieldTerms field term vectors//from w w w .java2 s . c o m * @return a sparse vector of <code>Double</code>s as an array * @throws IOException in case accessing the underlying index fails */ public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException { TermsEnum fieldTermsEnum = fieldTerms.iterator(); Double[] freqVector = null; if (docTerms != null && fieldTerms.size() > -1) { freqVector = new Double[(int) fieldTerms.size()]; int i = 0; TermsEnum docTermsEnum = docTerms.iterator(); BytesRef term; while ((term = fieldTermsEnum.next()) != null) { TermsEnum.SeekStatus seekStatus = docTermsEnum.seekCeil(term); if (seekStatus.equals(TermsEnum.SeekStatus.END)) { docTermsEnum = docTerms.iterator(); } if (seekStatus.equals(TermsEnum.SeekStatus.FOUND)) { long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document freqVector[i] = Long.valueOf(termFreqLocal).doubleValue(); } else { freqVector[i] = 0d; } i++; } } return freqVector; }
From source file:com.meizu.nlp.classification.utils.DocToDoubleVectorUtils.java
License:Apache License
/** * create a dense <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc * @param docTerms term vectors for a given document * @return a dense vector of <code>Double</code>s as an array * @throws IOException in case accessing the underlying index fails *//*w ww . j ava 2 s .com*/ public static Double[] toDenseLocalFreqDoubleArray(Terms docTerms) throws IOException { Double[] freqVector = null; if (docTerms != null) { freqVector = new Double[(int) docTerms.size()]; int i = 0; TermsEnum docTermsEnum = docTerms.iterator(); while (docTermsEnum.next() != null) { long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document freqVector[i] = Long.valueOf(termFreqLocal).doubleValue(); i++; } } return freqVector; }
From source file:com.meizu.nlp.classification.utils.DocToDoubleVectorUtilsTest.java
License:Apache License
@Test public void testSparseFreqDoubleArrayConversion() throws Exception { Terms fieldTerms = MultiFields.getTerms(index, "text"); if (fieldTerms != null && fieldTerms.size() != -1) { IndexSearcher indexSearcher = new IndexSearcher(index); for (ScoreDoc scoreDoc : indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE).scoreDocs) { Terms docTerms = index.getTermVector(scoreDoc.doc, "text"); Double[] vector = DocToDoubleVectorUtils.toSparseLocalFreqDoubleArray(docTerms, fieldTerms); assertNotNull(vector);//w ww. jav a 2 s .com assertTrue(vector.length > 0); } } }
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
/**
 * Checks collection-level statistics on {@link Terms}.
 *
 * A statistic value of -1 means the implementation does not support it, so each
 * statistic is compared only when BOTH sides actually report it.
 */
public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception {
    // number of documents with at least one term for this field
    if (leftTerms.getDocCount() != -1 && rightTerms.getDocCount() != -1) {
        assertEquals(leftTerms.getDocCount(), rightTerms.getDocCount());
    }
    // sum of docFreq over all terms
    if (leftTerms.getSumDocFreq() != -1 && rightTerms.getSumDocFreq() != -1) {
        assertEquals(leftTerms.getSumDocFreq(), rightTerms.getSumDocFreq());
    }
    // sum of totalTermFreq over all terms
    if (leftTerms.getSumTotalTermFreq() != -1 && rightTerms.getSumTotalTermFreq() != -1) {
        assertEquals(leftTerms.getSumTotalTermFreq(), rightTerms.getSumTotalTermFreq());
    }
    // number of unique terms
    if (leftTerms.size() != -1 && rightTerms.size() != -1) {
        assertEquals(leftTerms.size(), rightTerms.size());
    }
}
From source file:com.romeikat.datamessie.core.processing.service.fulltext.query.QueryUtil.java
License:Open Source License
public List<String> getIndexTerms(final FullTextSession fullTextSession, final int luceneDocumentId, final Class<?> clazz, final String field) { final IndexReader indexReader = fullTextSession.getSearchFactory().getIndexReaderAccessor().open(clazz); try {//from w w w . ja va2 s . c om final Terms terms = indexReader.getTermVector(luceneDocumentId, field); final List<String> termsList = Lists.newArrayListWithExpectedSize((int) terms.size()); final TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { final String term = text.utf8ToString(); termsList.add(term); } return termsList; } catch (final IOException e) { LOG.error("Could not determine index terms", e); return null; } }
From source file:edu.utsa.sifter.som.MainSOM.java
License:Apache License
/**
 * Scans the term dictionary of the "body" field and selects the top
 * Conf.MAX_VECTOR_FEATURES terms (by document frequency) whose doc-frequency ratio
 * lies within [DOC_FREQ_THRESHOLD_LOW, DOC_FREQ_THRESHOLD_HIGH] and whose length is
 * at least MIN_SOM_TERM_LENGTH. Populates the TermIndices map (term -> feature index)
 * and the Terms vector (feature index -> term).
 *
 * @throws IOException if reading the term dictionary fails
 */
void initTerms() throws IOException {
    final Terms terms = MultiFields.getTerms(Reader, "body");
    System.out.println("number of terms in index: " + terms.size());
    // min-heap ordered by doc count: the head is the weakest of the current top terms
    final PriorityQueue<TermPair> topTerms = new PriorityQueue<TermPair>(Conf.MAX_VECTOR_FEATURES,
            new TermPair.TermPairComparator());
    int num = 0;
    TermsEnum term = terms.iterator(null);
    while (term.next() != null) {
        final int count = term.docFreq();
        // fraction of documents containing this term
        final double r = ((double) count) / Reader.numDocs();
        if (Conf.DOC_FREQ_THRESHOLD_LOW <= r && r <= Conf.DOC_FREQ_THRESHOLD_HIGH) {
            final String s = term.term().utf8ToString();
            if (s.length() >= Conf.MIN_SOM_TERM_LENGTH) {
                if (topTerms.size() < Conf.MAX_VECTOR_FEATURES) {
                    // heap not yet full — accept unconditionally
                    topTerms.add(new TermPair(s, count));
                } else if (topTerms.peek().DocCount < count) {
                    // new term beats the weakest kept term — replace it
                    topTerms.remove();
                    topTerms.add(new TermPair(s, count));
                }
                ++num;
            }
        }
    }
    System.out.println(num + " terms with in doc frequency range");
    final int numFeatures = Math.min(topTerms.size(), Conf.MAX_VECTOR_FEATURES);
    // size the map so numFeatures entries fit under the default 0.75 load factor
    TermIndices = new HashMap<String, Integer>((numFeatures * 4 + 1) / 3); // respect load factor
    Terms = new java.util.Vector<String>(numFeatures);
    Terms.setSize(numFeatures);
    System.out.println("the top " + numFeatures + " features will be used");
    // the heap polls weakest-first, so fill indices from the back to put top terms first
    for (int i = numFeatures - 1; i > -1; --i) { // reverse order, to put top terms first
        TermPair t = topTerms.poll(); // least remaining
        TermIndices.put(t.Term, i);
        Terms.set(i, t.Term);
    }
}
From source file:game.TermFreq.java
void loadTfVec() throws Exception { IndexReader reader = retriever.getReader(); long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT); Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT); if (terms == null || terms.size() == 0) return;/*w w w . j av a 2 s. c o m*/ TermsEnum termsEnum; BytesRef term; tfvec = new ArrayList<>(); // Construct the normalized tf vector termsEnum = terms.iterator(null); // access the terms for this field int doclen = 0; while ((term = termsEnum.next()) != null) { // explore the terms for this field String termStr = term.utf8ToString(); String stem = retriever.analyze(termStr); DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { //get the term frequency in the document int tf = docsEnum.freq(); TermFreq tfq = new TermFreq(new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf); tfvec.add(tfq); doclen += tf; } } for (TermFreq tf : tfvec) { tf.tf = tf.tf / (float) doclen; // normalize by len float idf = sumDf / reader.docFreq(tf.term); tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf)); } Collections.sort(tfvec); }
From source file:indextranslator.BOWTranslator.java
public void translate(String docIdStr, int docId) throws Exception { String termText;//from w w w . j av a2 s .c o m BytesRef term; Terms tfvector; TermsEnum termsEnum; int tf; tfvector = reader.getTermVector(docId, TextDocIndexer.FIELD_ANALYZED_CONTENT); if (tfvector == null || tfvector.size() == 0) return; // Construct the normalized tf vector termsEnum = tfvector.iterator(); // access the terms for this field StringBuffer buff = new StringBuffer(); while ((term = termsEnum.next()) != null) { // explore the terms for this field tf = (int) termsEnum.totalTermFreq(); termText = term.utf8ToString(); buff.append(dict.getTranslations(termText, tf)).append("\n"); } Document doc = constructDoc(docIdStr, buff.toString()); writer.addDocument(doc); }
From source file:io.anserini.index.IndexUtils.java
License:Apache License
/**
 * Prints summary statistics for the index to stdout: document counts, unique and
 * total term counts for the body field, and per-field index options.
 *
 * @throws IOException if reading index metadata fails
 */
void printIndexStats() throws IOException {
    Fields fields = MultiFields.getFields(reader);
    Terms bodyTerms = fields.terms(LuceneDocumentGenerator.FIELD_BODY);

    System.out.println("Index statistics");
    System.out.println("----------------");
    System.out.println("documents: " + reader.numDocs());
    System.out.println("documents (non-empty): " + reader.getDocCount(LuceneDocumentGenerator.FIELD_BODY));
    System.out.println("unique terms: " + bodyTerms.size());
    System.out.println("total terms: " + reader.getSumTotalTermFreq(LuceneDocumentGenerator.FIELD_BODY));
    System.out.println("stored fields:");

    FieldInfos fieldInfos = MultiFields.getMergedFieldInfos(reader);
    for (String fieldName : fields) {
        FieldInfo info = fieldInfos.fieldInfo(fieldName);
        System.out.println(" " + fieldName + " (indexOption: " + info.getIndexOptions()
                + ", hasVectors: " + info.hasVectors() + ", hasPayloads: " + info.hasPayloads() + ")");
    }
}