Example usage for org.apache.lucene.index TermsEnum docFreq

List of usage examples for org.apache.lucene.index TermsEnum docFreq

Introduction

On this page you can find example usages for org.apache.lucene.index TermsEnum docFreq.

Prototype

public abstract int docFreq() throws IOException;

Document

Returns the number of documents containing the current term; deleted documents that have not yet been merged away are not taken into account.
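
Before the project examples below, here is a minimal, self-contained sketch of the typical call pattern, written for this page in the Lucene 5.x style used by most of the snippets (the printTermDocFreqs helper and DocFreqExample class are hypothetical): advance the TermsEnum with next() and call docFreq() for the term it is currently positioned on.

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public final class DocFreqExample {

    // Hypothetical helper: prints every term of a field together with its document frequency.
    static void printTermDocFreqs(LeafReader reader, String field) throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null) {
            return; // the field is not indexed in this segment
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            // docFreq() is only meaningful while the enum is positioned on a term
            System.out.println(term.utf8ToString() + "\t" + termsEnum.docFreq());
        }
    }
}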

Usage

From source file: BlockBuilding.AbstractBlockBuilding.java

License: Apache License

protected void parseIndex(IndexReader d1Index) {
    try {
        int[] documentIds = getDocumentIds(d1Index);
        Fields fields = MultiFields.getFields(d1Index);
        for (String field : fields) {
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                // skip terms that appear in fewer than two documents; they cannot form a block
                if (termsEnum.docFreq() < 2) {
                    continue;
                }

                final List<Integer> entityIds = new ArrayList<>();
                PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text);
                int doc;
                while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    entityIds.add(documentIds[doc]);
                }

                int[] idsArray = Converter.convertCollectionToArray(entityIds);
                UnilateralBlock block = new UnilateralBlock(idsArray);
                blocks.add(block);
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
}

From source file: br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java

License: Open Source License

private List<Entry<String, Float>> getTermScoreList(Directory directory)
        throws CorruptIndexException, IOException {

    Map<String, Float> termScoreMap = new HashMap<>();

    ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity();

    try (IndexReader idxReader = DirectoryReader.open(directory)) {

        idxReader.leaves().stream().map((leaf) -> leaf.reader()).forEach((reader) -> {
            try {
                Terms terms = reader.terms(Constants.DOC_CONTENT);
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum postings = null;
                int docsNum = idxReader.numDocs();

                BytesRef text;
                while ((text = termsEnum.next()) != null) {

                    postings = termsEnum.postings(postings);

                    while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                        int freq = postings.freq();
                        float tf = sim.tf(freq);
                        float idf = sim.idf(termsEnum.docFreq(), docsNum);
                        termScoreMap.put(text.utf8ToString(), BETA * (tf * idf));
                    }
                }

            } catch (IOException ex) {
                Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex);
            }
            // idxReader is closed by the enclosing try-with-resources; no explicit close is needed here.
        });

    }

    return new ArrayList<>(termScoreMap.entrySet());
}

From source file: br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java

License: Open Source License

private float getScore(Directory directory, String term) throws CorruptIndexException, IOException {

    try (IndexReader idxReader = DirectoryReader.open(directory)) {

        ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity();

        for (LeafReaderContext context : idxReader.leaves()) {
            LeafReader reader = context.reader();

            try {
                Terms terms = reader.terms(Constants.DOC_CONTENT);
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum postings = null;

                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    postings = termsEnum.postings(postings);
                    if (text.utf8ToString().equalsIgnoreCase(term)) {

                        while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                            int freq = postings.freq();
                            float tf = sim.tf(freq);
                            float idf = sim.idf(termsEnum.docFreq(), idxReader.numDocs());
                            return tf * idf;
                        }
                    }
                }

            } catch (IOException ex) {
                Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

    }

    return 0;
}

From source file: cc.twittertools.index.ExtractTermStatisticsFromIndex.java

License: Apache License

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("min").create(MIN_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options);
        System.exit(-1);
    }

    String indexLocation = cmdline.getOptionValue(INDEX_OPTION);
    int min = cmdline.hasOption(MIN_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1;

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name);
    TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);

    long missingCnt = 0;
    int skippedTerms = 0;
    BytesRef bytes = new BytesRef();
    while ((bytes = termsEnum.next()) != null) {
        byte[] buf = new byte[bytes.length];
        System.arraycopy(bytes.bytes, 0, buf, 0, bytes.length);
        String term = new String(buf, "UTF-8");
        int df = termsEnum.docFreq();
        long cf = termsEnum.totalTermFreq();

        if (df < min) {
            skippedTerms++;
            missingCnt += cf;
            continue;
        }

        out.println(term + "\t" + df + "\t" + cf);
    }

    reader.close();
    out.close();
    System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt);
}

From source file: com.basistech.lucene.tools.LuceneQueryTool.java

License: Apache License

private void enumerateTerms(String field) throws IOException {
    if (!allFieldNames.contains(field)) {
        throw new RuntimeException("Invalid field name: " + field);
    }
    List<LeafReaderContext> leaves = indexReader.leaves();
    TermsEnum termsEnum;
    boolean unindexedField = true;
    Map<String, Integer> termCountMap = new TreeMap<>();
    for (LeafReaderContext leaf : leaves) {
        Terms terms = leaf.reader().terms(field);
        if (terms == null) {
            continue;
        }
        unindexedField = false;
        termsEnum = terms.iterator();
        BytesRef bytesRef;
        while ((bytesRef = termsEnum.next()) != null) {
            String term = bytesRef.utf8ToString();
            if (termCountMap.containsKey(term)) {
                termCountMap.put(term, termsEnum.docFreq() + termCountMap.get(term));
            } else {
                termCountMap.put(term, termsEnum.docFreq());
            }
        }
    }
    if (unindexedField) {
        throw new RuntimeException("Unindexed field: " + field);
    }
    for (Map.Entry<String, Integer> entry : termCountMap.entrySet()) {
        defaultOut.println(entry.getKey() + " (" + entry.getValue() + ")");
    }
}

From source file: com.facebook.presto.operator.HashAggregationOperator.java

License: Apache License

private Map<String, Long> GetGroupByResult() throws IOException {

    IndexReader reader = null;
    Map<String, Long> returnMap = new HashMap<String, Long>();
    try {
        reader = DirectoryReader
                .open(FSDirectory.open(Paths.get("/home/liyong/workspace-neno/lucenetest/index")));
    } catch (IOException e) {
        e.printStackTrace();
    }
    IndexSearcher searcher = new IndexSearcher(reader);

    Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "orderpriority");
    TermsEnum te = terms.iterator();
    while (te.next() != null) {

        String name = te.term().utf8ToString();
        int count = te.docFreq();
        returnMap.put(name, Long.valueOf(count));
    }

    return returnMap;
}

From source file: com.facebook.presto.operator.ScanFilterAndProjectOperator.java

License: Apache License

private Map<String, Long> getCountResult() throws IOException {

    IndexReader reader = null;
    Map<String, Long> returnMap = new HashMap<String, Long>();
    try {
        reader = DirectoryReader
                .open(FSDirectory.open(Paths.get("/home/liyong/workspace-neno/lucenetest/index")));
    } catch (IOException e) {
        e.printStackTrace();
    }
    IndexSearcher searcher = new IndexSearcher(reader);

    Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "orderpriority");
    TermsEnum te = terms.iterator();
    while (te.next() != null) {

        String name = te.term().utf8ToString();
        int count = te.docFreq();
        returnMap.put(name, Long.valueOf(count));
    }

    return returnMap;
}

From source file: com.github.flaxsearch.resources.PostingsResource.java

License: Apache License

@GET
public TermData getPostings(@QueryParam("segment") Integer segment, @PathParam("field") String field,
        @PathParam("term") String term, @QueryParam("count") @DefaultValue("2147483647") int count)
        throws IOException {

    TermsEnum te = readerManager.findTermPostings(segment, field, term);
    Bits liveDocs = readerManager.getLiveDocs(segment);
    PostingsEnum pe = te.postings(null, PostingsEnum.NONE);

    int docFreq = te.docFreq();
    long totalTermFreq = te.totalTermFreq();

    int size = (docFreq < count) ? docFreq : count;
    int[] postings = new int[size];
    int docId;
    int i = 0;
    while ((docId = pe.nextDoc()) != PostingsEnum.NO_MORE_DOCS && i < count) {
        if (liveDocs != null && liveDocs.get(docId) == false)
            continue;
        postings[i] = docId;
        i++;
    }
    return new TermData(term, docFreq, totalTermFreq, postings);
}

From source file: com.globalsight.ling.lucene.HighFreqTerms.java

License: Apache License

public static void main(String[] args) throws Exception {
    IndexReader reader = null;
    if (args.length == 1) {
        SimpleFSDirectory fsd = new SimpleFSDirectory(new File(args[0]));
        reader = DirectoryReader.open(fsd);
    } else {
        usage();
        System.exit(1);
    }

    TermInfoQueue tiq = new TermInfoQueue(numTerms);
    //TODO: IS field right?
    String field = IndexDocument.TEXT;
    Terms terms = reader.getTermVector(0, field);
    //TermEnum terms = reader.terms();
    TermsEnum termsEnum = terms.iterator(null);

    BytesRef next = null;

    while ((next = termsEnum.next()) != null) {
        tiq.insertWithOverflow(new TermInfo(new Term(field, termsEnum.term()), termsEnum.docFreq()));
    }

    while (tiq.size() != 0) {
        TermInfo termInfo = (TermInfo) tiq.pop();
        System.out.println(termInfo.term + " " + termInfo.docFreq);
    }

    reader.close();
}

From source file: com.meizu.nlp.classification.CachingNaiveBayesClassifier.java

License: Apache License

/**
 * This function builds the frame of the cache. The cache stores word
 * occurrences in memory after they have been searched once. This cache can
 * give a 2-100x speedup when used properly, but it can consume a lot of memory.
 * There is an option to lower memory consumption: if a word has a very low
 * occurrence in the index, you can filter it out. The other parameter switches
 * the term searching: if it is true, only the terms in the cache skeleton will
 * be searched, but if it is false, terms that are not in the cache will be
 * searched as well (but not cached).
 *
 * @param minTermOccurrenceInCache A higher value lowers the cache size.
 * @param justCachedTerms          The switch that restricts searching to cached terms only.
 * @throws IOException If there is a low-level I/O error.
 */
public void reInitCache(int minTermOccurrenceInCache, boolean justCachedTerms) throws IOException {
    this.justCachedTerms = justCachedTerms;

    this.docsWithClassSize = countDocsWithClass();
    termCClassHitCache.clear();
    cclasses.clear();
    classTermFreq.clear();

    // build the cache for the word
    Map<String, Long> frequencyMap = new HashMap<>();
    for (String textFieldName : textFieldNames) {
        TermsEnum termsEnum = leafReader.terms(textFieldName).iterator();
        while (termsEnum.next() != null) {
            BytesRef term = termsEnum.term();
            String termText = term.utf8ToString();
            long frequency = termsEnum.docFreq();
            Long lastfreq = frequencyMap.get(termText);
            if (lastfreq != null)
                frequency += lastfreq;
            frequencyMap.put(termText, frequency);
        }
    }
    for (Map.Entry<String, Long> entry : frequencyMap.entrySet()) {
        if (entry.getValue() > minTermOccurrenceInCache) {
            termCClassHitCache.put(entry.getKey(), new ConcurrentHashMap<BytesRef, Integer>());
        }
    }

    // fill the class list
    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    while ((termsEnum.next()) != null) {
        cclasses.add(BytesRef.deepCopyOf(termsEnum.term()));
    }
    // fill the classTermFreq map
    for (BytesRef cclass : cclasses) {
        double avgNumberOfUniqueTerms = 0;
        for (String textFieldName : textFieldNames) {
            terms = MultiFields.getTerms(leafReader, textFieldName);
            long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
            avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount();
        }
        int docsWithC = leafReader.docFreq(new Term(classFieldName, cclass));
        classTermFreq.put(cclass, avgNumberOfUniqueTerms * docsWithC);
    }
}