Example usage for org.apache.lucene.index LeafReader getDocCount

Introduction

This page collects usage examples for org.apache.lucene.index.LeafReader.getDocCount, which returns the number of documents that have at least one term for the given field.

Prototype

@Override
public final int getDocCount(String field) throws IOException
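
Before the full examples below, here is a minimal sketch of calling getDocCount per index segment, assuming Lucene 5.x or later; the index path ("/path/to/index") and field name ("body") are hypothetical:

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.FSDirectory;

public class GetDocCountExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical index location and field name.
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            for (LeafReaderContext ctx : reader.leaves()) {
                // Number of documents in this segment with at least one term
                // in the "body" field (deleted documents are not subtracted).
                int docCount = ctx.reader().getDocCount("body");
                System.out.println("segment " + ctx.ord + ": " + docCount + " documents with field");
            }
        }
    }
}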

Usage

From source file: org.apache.tika.eval.tools.TopCommonTokenCounter.java

License: Apache License
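
This example builds a temporary Lucene index over a line-oriented input file, then scans the field's TermsEnum for per-term document frequencies, keeping the top-N tokens in a priority queue; getDocCount reports how many documents carry the field.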

private void execute(Path inputFile, Path commonTokensFile) throws Exception {
    Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
    AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
    try {
        Directory directory = FSDirectory.open(luceneDir);
        AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);

        Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        int maxLen = 1000000;
        int len = 0;
        try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
            List<Document> docs = new ArrayList<>();
            try (BufferedReader reader = getReader(inputFile)) {
                String line = reader.readLine();
                while (line != null) {
                    len += line.length();
                    Document document = new Document();
                    document.add(new TextField(FIELD, line, Field.Store.NO));
                    docs.add(document);
                    if (len > maxLen) {
                        writer.addDocuments(docs);
                        docs.clear();
                        len = 0;
                    }
                    line = reader.readLine();
                }
            }
            if (docs.size() > 0) {
                writer.addDocuments(docs);
            }
            writer.commit();
            writer.flush();
        }
        try (IndexReader reader = DirectoryReader.open(directory)) {
            // Present the multi-segment index as a single LeafReader.
            LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
            Terms terms = wrappedReader.terms(FIELD);
            TermsEnum termsEnum = terms.iterator();
            BytesRef bytesRef = termsEnum.next();
            // Number of documents with at least one term in FIELD.
            int docsWThisField = wrappedReader.getDocCount(FIELD);
            while (bytesRef != null) {
                int df = termsEnum.docFreq();
                long tf = termsEnum.totalTermFreq();
                if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                    bytesRef = termsEnum.next();
                    continue;
                }

                if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) {
                    String t = bytesRef.utf8ToString();
                    if (!WHITE_LIST.contains(t) && !BLACK_LIST.contains(t)) {
                        queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                    }

                }
                bytesRef = termsEnum.next();
            }
        }
    } finally {
        FileUtils.deleteDirectory(luceneDir.toFile());
    }

    writeTopN(commonTokensFile, queue);

}

From source file: org.tallison.gramreaper.terms.DumpTerms.java

License: Apache License
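
Here getDocCount supplies the total number of documents carrying the field, which serves as the denominator for the minimum document-percentage filter while dumping the top-N terms.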

private void dumpTopNField(LeafReader leafReader, String field) throws IOException {
    AbstractTokenTFDFPriorityQueue queue = config.sort.equals(DumpTermsConfig.SORT.DF)
            ? new TokenDFPriorityQueue(config.topN)
            : new TokenTFPriorityQueue(config.topN);
    Terms terms = leafReader.terms(field);
    if (terms == null) {
        StringBuilder sb = new StringBuilder();
        int i = 0;
        for (FieldInfo fieldInfo : leafReader.getFieldInfos()) {
            if (i++ > 0) {
                sb.append("\n");
            }
            sb.append(fieldInfo.name);

        }
        throw new RuntimeException("I can't find field \"" + field + "\".\n" + "I only see:\n" + sb.toString());
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    // Number of documents with at least one term in this field; used below
    // as the denominator for the minimum document-percentage filter.
    int docsWThisField = leafReader.getDocCount(field);
    while (bytesRef != null) {
        int df = termsEnum.docFreq();
        long tf = termsEnum.totalTermFreq();
        if (config.minDocFreq > -1 && df < config.minDocFreq) {
            bytesRef = termsEnum.next();
            continue;
        }
        if (config.minDocPercentage > -1.0d
                && (double) df / (double) docsWThisField < config.minDocPercentage) {
            bytesRef = termsEnum.next();
            continue;
        }

        if (queue.top() == null || queue.size() < config.topN
                || (config.sort.equals(DumpTermsConfig.SORT.DF) ? df >= queue.top().df : tf > queue.top().tf)) {
            String t = bytesRef.utf8ToString();
            if (!config.stopWords.contains(t) && !config.startWords.contains(t)) {

                queue.insertWithOverflow(new TokenDFTF(t, df, tf));
            }
        }
        bytesRef = termsEnum.next();
    }
    if (config.outputFile == null) {
        StringBuilder sb = new StringBuilder();
        for (TokenDFTF tp : queue.getArray()) {
            System.out.println(getRow(sb, tp));
        }
    } else if (Files.isDirectory(config.outputFile)) {
        writeTopN(config.outputFile.resolve(field), queue);
    } else {
        writeTopN(config.outputFile, queue);
    }
}

From source file: suonos.lucene.fields.IndexedFieldCountsBuilder.java

License: Apache License
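
This example walks each segment's LeafReader directly, calling getDocCount per field and then tallying per-term counts from doc values (sorted or sorted-set) or from postings for regular indexed fields.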

public IndexedFieldCountsBuilder addField(String fieldName, String filter) throws IOException {

    final IndexedField fld = models.indexedField(fieldName);
    final Map<String, IndexedFieldTermCount> valuesMap = AntLib.newHashMap();
    final TIntIntHashMap ordCounts = new TIntIntHashMap();

    if (filter != null) {
        filter = filter.toLowerCase();
    }

    // Get count of segments.
    //
    int sz = ir.leaves().size();

    for (int i = 0; i != sz; i++) {
        // Get the segment reader.
        //
        LeafReader lr = ir.leaves().get(i).reader();

        // Doc count for the field, e.g. "album_genres". (The return value is
        // not used further in this example.)
        //
        lr.getDocCount(fld.getName());

        // Get all documents that have the field "album_genres"
        //
        Bits docs = lr.getDocsWithField(fld.getName());
        ordCounts.clear();

        // Enumerate the field terms.
        //
        if (fld.isDocValues()) {
            if (fld.isMultiValue()) {
                // docvalues & multivalue is a SortedSetDocValues
                // Per-Document values in a SortedDocValues are
                // deduplicated, dereferenced, and sorted into a dictionary
                // of
                // unique values. A pointer to the dictionary value
                // (ordinal) can be retrieved for each document.
                // Ordinals are dense and in increasing sorted order.
                //
                SortedSetDocValues set = lr.getSortedSetDocValues(fld.getName());

                if (set != null) {
                    // For all documents that have the field "album_genres":
                    //
                    for (int docId = 0; docId != docs.length(); docId++) {
                        if (docs.get(docId)) {
                            // Enumerate the set of [terms] of
                            // "album_genres" for the document represented
                            // by docId.
                            // Each ord represents the term value.
                            //
                            set.setDocument(docId);

                            // For each term bump up the frequency.
                            //
                            long ord;
                            while ((ord = set.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                                ordCounts.adjustOrPutValue((int) ord, 1, 1);

                                System.out.println("term=" + set.lookupOrd(ord).utf8ToString());
                            }
                        }
                    }

                    TermsEnum te = set.termsEnum();
                    BytesRef term;

                    while ((term = te.next()) != null) {

                        int ord = (int) te.ord();

                        add(fld, valuesMap, filter, term, ordCounts.get(ord));
                    }

                }

            } else {
                SortedDocValues set = lr.getSortedDocValues(fld.getName());

                if (set != null) {
                    // For all documents that have the field "album_genres":
                    //
                    for (int docId = 0; docId != docs.length(); docId++) {
                        if (docs.get(docId)) {
                            // Get the term - Classical, Rock, etc.
                            //
                            BytesRef term = set.get(docId);

                            add(fld, valuesMap, filter, term, 1);
                        }
                    }
                }
            }
        } else {
            // Normal field, not a doc value.
            //
            Terms terms = lr.terms(fld.getName());
            TermsEnum te = terms.iterator();

            BytesRef term;
            while ((term = te.next()) != null) {
                add(fld, valuesMap, filter, term, te.docFreq());
            }
        }

        /*
         * SORTED example:
         *   doc[0] = "aardvark", doc[1] = "beaver", doc[2] = "aardvark"
         * is stored as ordinals into a sorted dictionary:
         *   doc[0] = 0, doc[1] = 1, doc[2] = 0
         * with terms:
         *   term[0] = "aardvark", term[1] = "beaver"
         */

        // http://127.0.0.1:8080/api/facets?fields=track_title_a
        // The above should return B:(4) because four titles start with B.
    }

    // Get the array of term counters.
    //
    IndexedFieldTermCount[] list = valuesMap.values().toArray(new IndexedFieldTermCount[0]);

    // Sort by term.
    //
    Arrays.sort(list);

    // add to the map.
    //
    this.fieldCounts.put(fld.getName(), list);

    return this;
}