Example usage for org.apache.lucene.index Terms size

List of usage examples for org.apache.lucene.index Terms size

Introduction

On this page you can find an example of usage for org.apache.lucene.index Terms size.

Prototype

public abstract long size() throws IOException;

Source Link

Document

Returns the number of terms for this field, or -1 if this measure isn't stored by the codec.

Usage

From source file:com.github.flaxsearch.api.TermsData.java

License:Apache License

/**
 * Captures summary statistics and the encoded min/max terms for a field.
 *
 * @param terms     the field's {@code Terms} instance to read statistics from
 * @param termsList the pre-collected list of term strings for this field
 * @param encoding  the encoding used to render the min and max terms
 * @throws IOException if reading the term statistics from the index fails
 */
public TermsData(Terms terms, List<String> termsList, String encoding) throws IOException {
    this.terms = termsList;
    this.docCount = terms.getDocCount();
    this.termCount = terms.size();
    // Render the smallest and largest terms of the field using the requested encoding.
    this.minTerm = BytesRefUtils.encode(terms.getMin(), encoding);
    this.maxTerm = BytesRefUtils.encode(terms.getMax(), encoding);
}

From source file:com.meizu.nlp.classification.utils.DocToDoubleVectorUtils.java

License:Apache License

/**
 * create a sparse <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc
 * @param docTerms term vectors for a given document
 * @param fieldTerms field term vectors//from w  w  w .java2  s . c  o  m
 * @return a sparse vector of <code>Double</code>s as an array
 * @throws IOException in case accessing the underlying index fails
 */
/**
 * Creates a sparse {@code Double} vector for a document, using the local frequency of each
 * field term within the document. The vector has one slot per term in {@code fieldTerms}
 * (in field term order); slots for terms absent from the document are set to 0.
 *
 * @param docTerms term vectors for a given document (may be null)
 * @param fieldTerms field-level term vectors defining the vector dimensions
 * @return a sparse vector of {@code Double}s as an array, or null when {@code docTerms} is
 *         null or the codec does not store the field's term count
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException {
    TermsEnum fieldTermsEnum = fieldTerms.iterator();
    Double[] freqVector = null;
    // size() == -1 means the codec does not store the term count; skip in that case.
    if (docTerms != null && fieldTerms.size() > -1) {
        freqVector = new Double[(int) fieldTerms.size()];
        int i = 0;
        TermsEnum docTermsEnum = docTerms.iterator();
        BytesRef term;
        while ((term = fieldTermsEnum.next()) != null) {
            // seekCeil positions the doc enum at the smallest term >= the current field term.
            TermsEnum.SeekStatus seekStatus = docTermsEnum.seekCeil(term);
            if (seekStatus.equals(TermsEnum.SeekStatus.END)) {
                // The doc enum is exhausted; re-obtain it so subsequent seeks still work.
                docTermsEnum = docTerms.iterator();
            }
            if (seekStatus.equals(TermsEnum.SeekStatus.FOUND)) {
                long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document
                freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
            } else {
                // Field term not present in this document.
                freqVector[i] = 0d;
            }
            i++;
        }
    }
    return freqVector;
}

From source file:com.meizu.nlp.classification.utils.DocToDoubleVectorUtils.java

License:Apache License

/**
 * create a dense <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc
 * @param docTerms term vectors for a given document
 * @return a dense vector of <code>Double</code>s as an array
 * @throws IOException in case accessing the underlying index fails
 *//*w ww  . j  ava  2 s  .com*/
/**
 * Creates a dense {@code Double} vector for a document, using the local frequency of each of
 * the document's own terms, one slot per distinct term in term order.
 *
 * @param docTerms term vectors for a given document (may be null)
 * @return a dense vector of {@code Double}s as an array, or null when {@code docTerms} is null
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toDenseLocalFreqDoubleArray(Terms docTerms) throws IOException {
    Double[] freqVector = null;
    if (docTerms != null) {
        long size = docTerms.size();
        if (size == -1) {
            // Terms#size() returns -1 when the codec does not store the term count; count the
            // terms with a first pass so we can size the array correctly instead of failing
            // with a NegativeArraySizeException.
            size = 0;
            TermsEnum countingEnum = docTerms.iterator();
            while (countingEnum.next() != null) {
                size++;
            }
        }
        freqVector = new Double[(int) size];
        int i = 0;
        TermsEnum docTermsEnum = docTerms.iterator();

        while (docTermsEnum.next() != null) {
            long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document
            freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
            i++;
        }
    }
    return freqVector;
}

From source file:com.meizu.nlp.classification.utils.DocToDoubleVectorUtilsTest.java

License:Apache License

@Test
@Test
public void testSparseFreqDoubleArrayConversion() throws Exception {
    // Field-level term vectors define the dimensions of the sparse vectors.
    Terms fieldTerms = MultiFields.getTerms(index, "text");
    if (fieldTerms == null || fieldTerms.size() == -1) {
        return; // nothing to verify when the codec does not expose the field term count
    }
    IndexSearcher indexSearcher = new IndexSearcher(index);
    ScoreDoc[] hits = indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE).scoreDocs;
    for (ScoreDoc hit : hits) {
        Terms docTerms = index.getTermVector(hit.doc, "text");
        Double[] sparse = DocToDoubleVectorUtils.toSparseLocalFreqDoubleArray(docTerms, fieldTerms);
        // Every document must yield a non-empty sparse frequency vector.
        assertNotNull(sparse);
        assertTrue(sparse.length > 0);
    }
}

From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java

License:Apache License

/**
 * checks collection-level statistics on Terms
 *//*  w ww  .  j a  v  a  2 s. co m*/
/**
 * Checks that the collection-level statistics of two {@code Terms} instances agree.
 * Each statistic is compared only when both sides actually store it (-1 means "not stored").
 */
public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception {
    long leftDocCount = leftTerms.getDocCount();
    long rightDocCount = rightTerms.getDocCount();
    if (leftDocCount != -1 && rightDocCount != -1) {
        assertEquals(leftDocCount, rightDocCount);
    }
    long leftSumDocFreq = leftTerms.getSumDocFreq();
    long rightSumDocFreq = rightTerms.getSumDocFreq();
    if (leftSumDocFreq != -1 && rightSumDocFreq != -1) {
        assertEquals(leftSumDocFreq, rightSumDocFreq);
    }
    long leftSumTotalTermFreq = leftTerms.getSumTotalTermFreq();
    long rightSumTotalTermFreq = rightTerms.getSumTotalTermFreq();
    if (leftSumTotalTermFreq != -1 && rightSumTotalTermFreq != -1) {
        assertEquals(leftSumTotalTermFreq, rightSumTotalTermFreq);
    }
    long leftSize = leftTerms.size();
    long rightSize = rightTerms.size();
    if (leftSize != -1 && rightSize != -1) {
        assertEquals(leftSize, rightSize);
    }
}

From source file:com.romeikat.datamessie.core.processing.service.fulltext.query.QueryUtil.java

License:Open Source License

/**
 * Collects the terms stored in the term vector of a given Lucene document field.
 *
 * @param fullTextSession session used to obtain an IndexReader for {@code clazz}
 * @param luceneDocumentId internal Lucene document id whose term vector is read
 * @param clazz entity class whose index is opened
 * @param field name of the field whose term vector is enumerated
 * @return the terms of the field's term vector, or null if the term vector is
 *         missing or cannot be read
 */
public List<String> getIndexTerms(final FullTextSession fullTextSession, final int luceneDocumentId,
        final Class<?> clazz, final String field) {
    final IndexReader indexReader = fullTextSession.getSearchFactory().getIndexReaderAccessor().open(clazz);
    try {
        final Terms terms = indexReader.getTermVector(luceneDocumentId, field);
        if (terms == null) {
            // No term vector stored for this document/field; treat like a read failure.
            LOG.error("Could not determine index terms: no term vector for field " + field);
            return null;
        }
        // size() may be -1 when the codec does not store the term count; never pass a
        // negative expected size to the list factory.
        final List<String> termsList = Lists.newArrayListWithExpectedSize((int) Math.max(terms.size(), 0));

        final TermsEnum termsEnum = terms.iterator();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            final String term = text.utf8ToString();
            termsList.add(term);
        }

        return termsList;
    } catch (final IOException e) {
        LOG.error("Could not determine index terms", e);
        return null;
    } finally {
        // Readers obtained from the accessor must be returned to it, otherwise they leak.
        fullTextSession.getSearchFactory().getIndexReaderAccessor().close(indexReader);
    }
}

From source file:edu.utsa.sifter.som.MainSOM.java

License:Apache License

/**
 * Builds the SOM term vocabulary from the "body" field of the index.
 * Keeps the top {@code Conf.MAX_VECTOR_FEATURES} terms (by document frequency) whose
 * relative document frequency falls within the configured range and whose length meets
 * {@code Conf.MIN_SOM_TERM_LENGTH}, then records each term's feature index in
 * {@code TermIndices} and its text in {@code Terms}.
 *
 * @throws IOException if reading the index terms fails
 */
void initTerms() throws IOException {
    // NOTE(review): MultiFields.getTerms may return null if no "body" field exists —
    // confirm callers guarantee the field is present, otherwise size() below throws NPE.
    final Terms terms = MultiFields.getTerms(Reader, "body");

    System.out.println("number of terms in index: " + terms.size());
    // Min-heap ordered by document count, so peek() is always the weakest retained term.
    final PriorityQueue<TermPair> topTerms = new PriorityQueue<TermPair>(Conf.MAX_VECTOR_FEATURES,
            new TermPair.TermPairComparator());

    int num = 0; // number of terms that passed both the frequency-range and length filters
    TermsEnum term = terms.iterator(null);
    while (term.next() != null) {
        final int count = term.docFreq();
        // Relative document frequency of the term across the whole index.
        final double r = ((double) count) / Reader.numDocs();

        if (Conf.DOC_FREQ_THRESHOLD_LOW <= r && r <= Conf.DOC_FREQ_THRESHOLD_HIGH) {
            final String s = term.term().utf8ToString();
            if (s.length() >= Conf.MIN_SOM_TERM_LENGTH) {
                if (topTerms.size() < Conf.MAX_VECTOR_FEATURES) {
                    topTerms.add(new TermPair(s, count));
                } else if (topTerms.peek().DocCount < count) {
                    // Heap is full: evict the current weakest term in favor of this stronger one.
                    topTerms.remove();
                    topTerms.add(new TermPair(s, count));
                }
                ++num;
            }
        }
    }
    System.out.println(num + " terms with in doc frequency range");

    final int numFeatures = Math.min(topTerms.size(), Conf.MAX_VECTOR_FEATURES);
    TermIndices = new HashMap<String, Integer>((numFeatures * 4 + 1) / 3); // respect load factor
    Terms = new java.util.Vector<String>(numFeatures);
    Terms.setSize(numFeatures);
    System.out.println("the top " + numFeatures + " features will be used");
    // Drain the min-heap: the weakest terms come out first, so filling the vector from the
    // back puts the strongest (most frequent) terms at the lowest indices.
    for (int i = numFeatures - 1; i > -1; --i) { // reverse order, to put top terms first
        TermPair t = topTerms.poll(); // least remaining
        TermIndices.put(t.Term, i);
        Terms.set(i, t.Term);
        // System.out.println("Including term " + t.Term + " (" + t.DocCount + ")");
    }
}

From source file:game.TermFreq.java

/**
 * Builds the normalized, weighted term-frequency vector for the document to guess.
 * Each entry's tf is normalized by document length and weighted by a smoothed
 * idf-style factor, then the vector is sorted.
 *
 * @throws Exception if reading the term vector or analyzing a term fails
 */
void loadTfVec() throws Exception {

    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);

    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;

    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        // NOTE(review): the analyzed form is computed but never used — confirm whether
        // analyze() has required side effects before removing this call.
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // get the term frequency in the document
            int tf = docsEnum.freq();
            TermFreq tfq = new TermFreq(new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf);
            tfvec.add(tfq);

            doclen += tf;
        }
    }

    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by len
        // Use floating-point division: the previous long/int integer division truncated
        // the idf ratio (e.g. sumDf=5, docFreq=2 yielded 2 instead of 2.5).
        float idf = (float) sumDf / reader.docFreq(tf.term);
        tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf));
    }

    Collections.sort(tfvec);
}

From source file:indextranslator.BOWTranslator.java

/**
 * Translates the bag-of-words term vector of a document and writes the result
 * as a new document to the output index.
 *
 * @param docIdStr the external (string) identifier of the document
 * @param docId    the internal Lucene document id whose term vector is read
 * @throws Exception if reading the term vector or writing the document fails
 */
public void translate(String docIdStr, int docId) throws Exception {
    Terms termVector = reader.getTermVector(docId, TextDocIndexer.FIELD_ANALYZED_CONTENT);
    if (termVector == null || termVector.size() == 0) {
        return; // nothing stored for this document
    }

    // Accumulate one line of translations per distinct term in the vector.
    StringBuilder translated = new StringBuilder();
    TermsEnum termsEnum = termVector.iterator();
    BytesRef termBytes;
    while ((termBytes = termsEnum.next()) != null) {
        int frequency = (int) termsEnum.totalTermFreq();
        String termText = termBytes.utf8ToString();
        translated.append(dict.getTranslations(termText, frequency)).append("\n");
    }

    writer.addDocument(constructDoc(docIdStr, translated.toString()));
}

From source file:io.anserini.index.IndexUtils.java

License:Apache License

/**
 * Prints summary statistics for the index: document counts, term counts of the
 * body field, and the indexing options of every stored field.
 *
 * @throws IOException if reading index statistics fails
 */
void printIndexStats() throws IOException {
    Fields fields = MultiFields.getFields(reader);
    Terms bodyTerms = fields.terms(LuceneDocumentGenerator.FIELD_BODY);

    System.out.println("Index statistics");
    System.out.println("----------------");
    System.out.println("documents:             " + reader.numDocs());
    System.out.println("documents (non-empty): " + reader.getDocCount(LuceneDocumentGenerator.FIELD_BODY));
    System.out.println("unique terms:          " + bodyTerms.size());
    System.out.println("total terms:           " + reader.getSumTotalTermFreq(LuceneDocumentGenerator.FIELD_BODY));

    System.out.println("stored fields:");

    // Merged field infos cover every field across all segments.
    FieldInfos fieldInfos = MultiFields.getMergedFieldInfos(reader);
    for (String fieldName : fields) {
        FieldInfo info = fieldInfos.fieldInfo(fieldName);
        System.out.println("  " + fieldName + " (" + "indexOption: " + info.getIndexOptions()
                + ", hasVectors: " + info.hasVectors() + ", hasPayloads: " + info.hasPayloads() + ")");
    }
}