Example usage for org.apache.lucene.index TermsEnum docFreq

List of usage examples for org.apache.lucene.index TermsEnum docFreq

Introduction

On this page you can find example usages for org.apache.lucene.index TermsEnum docFreq.

Prototype

public abstract int docFreq() throws IOException;

Document

Returns the number of documents containing the current term; deleted documents that have not yet been merged away are not taken into account.
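
Before the project examples below, here is a minimal, self-contained sketch of the typical call pattern, written for this page in the Lucene 5.x style used by most of the snippets (the printTermDocFreqs helper and DocFreqExample class are hypothetical): advance the TermsEnum with next() and call docFreq() for the term it is currently positioned on.

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public final class DocFreqExample {

    // Hypothetical helper: prints every term of a field together with its document frequency.
    static void printTermDocFreqs(LeafReader reader, String field) throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null) {
            return; // the field is not indexed in this segment
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            // docFreq() is only meaningful while the enum is positioned on a term
            System.out.println(term.utf8ToString() + "\t" + termsEnum.docFreq());
        }
    }
}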

Usage

From source file: BlockBuilding.AbstractBlockBuilding.java

License: Apache License

protected void parseIndex(IndexReader d1Index) {
    try {
        int[] documentIds = getDocumentIds(d1Index);
        Fields fields = MultiFields.getFields(d1Index);
        for (String field : fields) {
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                // skip terms that appear in fewer than two documents; they cannot form a block
                if (termsEnum.docFreq() < 2) {
                    continue;
                }

                final List<Integer> entityIds = new ArrayList<>();
                PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text);
                int doc;
                while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    entityIds.add(documentIds[doc]);
                }

                int[] idsArray = Converter.convertCollectionToArray(entityIds);
                UnilateralBlock block = new UnilateralBlock(idsArray);
                blocks.add(block);
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
}

From source file: br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java

License: Open Source License

private List<Entry<String, Float>> getTermScoreList(Directory directory)
        throws CorruptIndexException, IOException {

    Map<String, Float> termScoreMap = new HashMap<>();

    ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity();

    try (IndexReader idxReader = DirectoryReader.open(directory)) {

        idxReader.leaves().stream().map((leaf) -> leaf.reader()).forEach((reader) -> {
            try {
                Terms terms = reader.terms(Constants.DOC_CONTENT);
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum postings = null;
                int docsNum = idxReader.numDocs();

                BytesRef text;
                while ((text = termsEnum.next()) != null) {

                    postings = termsEnum.postings(postings);

                    while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                        int freq = postings.freq();
                        float tf = sim.tf(freq);
                        float idf = sim.idf(termsEnum.docFreq(), docsNum);
                        termScoreMap.put(text.utf8ToString(), BETA * (tf * idf));
                    }
                }

            } catch (IOException ex) {
                Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex);
            }
            // idxReader is closed by the enclosing try-with-resources; no explicit close is needed here.
        });

    }

    return new ArrayList<>(termScoreMap.entrySet());
}

From source file: br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java

License: Open Source License

private float getScore(Directory directory, String term) throws CorruptIndexException, IOException {

    try (IndexReader idxReader = DirectoryReader.open(directory)) {

        ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity();

        for (LeafReaderContext context : idxReader.leaves()) {
            LeafReader reader = context.reader();

            try {
                Terms terms = reader.terms(Constants.DOC_CONTENT);
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum postings = null;

                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    postings = termsEnum.postings(postings);
                    if (text.utf8ToString().equalsIgnoreCase(term)) {

                        while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                            int freq = postings.freq();
                            float tf = sim.tf(freq);
                            float idf = sim.idf(termsEnum.docFreq(), idxReader.numDocs());
                            return tf * idf;
                        }
                    }
                }

            } catch (IOException ex) {
                Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

    }

    return 0;
}

From source file: cc.twittertools.index.ExtractTermStatisticsFromIndex.java

License: Apache License

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("min").create(MIN_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options);
        System.exit(-1);
    }

    String indexLocation = cmdline.getOptionValue(INDEX_OPTION);
    int min = cmdline.hasOption(MIN_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1;

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name);
    TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);

    long missingCnt = 0;
    int skippedTerms = 0;
    BytesRef bytes = new BytesRef();
    while ((bytes = termsEnum.next()) != null) {
        byte[] buf = new byte[bytes.length];
        System.arraycopy(bytes.bytes, 0, buf, 0, bytes.length);
        String term = new String(buf, "UTF-8");
        int df = termsEnum.docFreq();
        long cf = termsEnum.totalTermFreq();

        if (df < min) {
            skippedTerms++;
            missingCnt += cf;
            continue;
        }

        out.println(term + "\t" + df + "\t" + cf);
    }

    reader.close();
    out.close();
    System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt);
}

From source file: com.basistech.lucene.tools.LuceneQueryTool.java

License: Apache License

private void enumerateTerms(String field) throws IOException {
    if (!allFieldNames.contains(field)) {
        throw new RuntimeException("Invalid field name: " + field);
    }
    List<LeafReaderContext> leaves = indexReader.leaves();
    TermsEnum termsEnum;
    boolean unindexedField = true;
    Map<String, Integer> termCountMap = new TreeMap<>();
    for (LeafReaderContext leaf : leaves) {
        Terms terms = leaf.reader().terms(field);
        if (terms == null) {
            continue;
        }
        unindexedField = false;
        termsEnum = terms.iterator();
        BytesRef bytesRef;
        while ((bytesRef = termsEnum.next()) != null) {
            String term = bytesRef.utf8ToString();
            if (termCountMap.containsKey(term)) {
                termCountMap.put(term, termsEnum.docFreq() + termCountMap.get(term));
            } else {
                termCountMap.put(term, termsEnum.docFreq());
            }
        }
    }
    if (unindexedField) {
        throw new RuntimeException("Unindexed field: " + field);
    }
    for (Map.Entry<String, Integer> entry : termCountMap.entrySet()) {
        defaultOut.println(entry.getKey() + " (" + entry.getValue() + ")");
    }
}

From source file: com.facebook.presto.operator.HashAggregationOperator.java

License: Apache License

private Map<String, Long> GetGroupByResult() throws IOException {

    IndexReader reader = null;
    Map<String, Long> returnMap = new HashMap<String, Long>();
    try {
        reader = DirectoryReader
                .open(FSDirectory.open(Paths.get("/home/liyong/workspace-neno/lucenetest/index")));
    } catch (IOException e) {
        e.printStackTrace();
    }
    IndexSearcher searcher = new IndexSearcher(reader);

    Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "orderpriority");
    TermsEnum te = terms.iterator();
    while (te.next() != null) {

        String name = te.term().utf8ToString();
        int count = te.docFreq();
        returnMap.put(name, Long.valueOf(count));
    }

    return returnMap;
}

From source file: com.facebook.presto.operator.ScanFilterAndProjectOperator.java

License: Apache License

private Map<String, Long> getCountResult() throws IOException {

    IndexReader reader = null;
    Map<String, Long> returnMap = new HashMap<String, Long>();
    try {
        reader = DirectoryReader
                .open(FSDirectory.open(Paths.get("/home/liyong/workspace-neno/lucenetest/index")));
    } catch (IOException e) {
        e.printStackTrace();
    }
    IndexSearcher searcher = new IndexSearcher(reader);

    Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "orderpriority");
    TermsEnum te = terms.iterator();
    while (te.next() != null) {

        String name = te.term().utf8ToString();
        int count = te.docFreq();
        returnMap.put(name, Long.valueOf(count));
    }

    return returnMap;
}

From source file: com.github.flaxsearch.resources.PostingsResource.java

License: Apache License

@GET
public TermData getPostings(@QueryParam("segment") Integer segment, @PathParam("field") String field,
        @PathParam("term") String term, @QueryParam("count") @DefaultValue("2147483647") int count)
        throws IOException {

    TermsEnum te = readerManager.findTermPostings(segment, field, term);
    Bits liveDocs = readerManager.getLiveDocs(segment);
    PostingsEnum pe = te.postings(null, PostingsEnum.NONE);

    int docFreq = te.docFreq();
    long totalTermFreq = te.totalTermFreq();

    int size = (docFreq < count) ? docFreq : count;
    int[] postings = new int[size];
    int docId;
    int i = 0;
    while ((docId = pe.nextDoc()) != PostingsEnum.NO_MORE_DOCS && i < count) {
        if (liveDocs != null && liveDocs.get(docId) == false)
            continue;
        postings[i] = docId;
        i++;
    }
    return new TermData(term, docFreq, totalTermFreq, postings);
}

From source file: com.globalsight.ling.lucene.HighFreqTerms.java

License: Apache License

public static void main(String[] args) throws Exception {
    IndexReader reader = null;
    if (args.length == 1) {
        SimpleFSDirectory fsd = new SimpleFSDirectory(new File(args[0]));
        reader = DirectoryReader.open(fsd);
    } else {
        usage();
        System.exit(1);
    }

    TermInfoQueue tiq = new TermInfoQueue(numTerms);
    //TODO: IS field right?
    String field = IndexDocument.TEXT;
    Terms terms = reader.getTermVector(0, field);
    //TermEnum terms = reader.terms();
    TermsEnum termsEnum = terms.iterator(null);

    BytesRef next = null;

    while ((next = termsEnum.next()) != null) {
        tiq.insertWithOverflow(new TermInfo(new Term(field, termsEnum.term()), termsEnum.docFreq()));
    }

    while (tiq.size() != 0) {
        TermInfo termInfo = (TermInfo) tiq.pop();
        System.out.println(termInfo.term + " " + termInfo.docFreq);
    }

    reader.close();
}

From source file: com.meizu.nlp.classification.CachingNaiveBayesClassifier.java

License: Apache License

/**
 * This function builds the frame of the cache. The cache stores word
 * occurrences in memory after they have been searched once. This cache can
 * give a 2-100x speedup when used properly, but it can consume a lot of memory.
 * There is an option to lower memory consumption: if a word has a very low
 * occurrence in the index, you can filter it out. The other parameter switches
 * the term searching: if it is true, only the terms in the cache skeleton will
 * be searched, but if it is false, terms that are not in the cache will be
 * searched as well (but not cached).
 *
 * @param minTermOccurrenceInCache A higher value lowers the cache size.
 * @param justCachedTerms          The switch that restricts searching to cached terms only.
 * @throws IOException If there is a low-level I/O error.
 */
public void reInitCache(int minTermOccurrenceInCache, boolean justCachedTerms) throws IOException {
    this.justCachedTerms = justCachedTerms;

    this.docsWithClassSize = countDocsWithClass();
    termCClassHitCache.clear();
    cclasses.clear();
    classTermFreq.clear();

    // build the cache for the word
    Map<String, Long> frequencyMap = new HashMap<>();
    for (String textFieldName : textFieldNames) {
        TermsEnum termsEnum = leafReader.terms(textFieldName).iterator();
        while (termsEnum.next() != null) {
            BytesRef term = termsEnum.term();
            String termText = term.utf8ToString();
            long frequency = termsEnum.docFreq();
            Long lastfreq = frequencyMap.get(termText);
            if (lastfreq != null)
                frequency += lastfreq;
            frequencyMap.put(termText, frequency);
        }
    }
    for (Map.Entry<String, Long> entry : frequencyMap.entrySet()) {
        if (entry.getValue() > minTermOccurrenceInCache) {
            termCClassHitCache.put(entry.getKey(), new ConcurrentHashMap<BytesRef, Integer>());
        }
    }

    // fill the class list
    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    while ((termsEnum.next()) != null) {
        cclasses.add(BytesRef.deepCopyOf(termsEnum.term()));
    }
    // fill the classTermFreq map
    for (BytesRef cclass : cclasses) {
        double avgNumberOfUniqueTerms = 0;
        for (String textFieldName : textFieldNames) {
            terms = MultiFields.getTerms(leafReader, textFieldName);
            long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
            avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount();
        }
        int docsWithC = leafReader.docFreq(new Term(classFieldName, cclass));
        classTermFreq.put(cclass, avgNumberOfUniqueTerms * docsWithC);
    }
}