List of usage examples for `org.apache.lucene.index.LeafReader#getDocCount`.
@Override public final int getDocCount(String field) throws IOException
From source file: org.apache.tika.eval.tools.TopCommonTokenCounter.java
License: Apache License
private void execute(Path inputFile, Path commonTokensFile) throws Exception { Path luceneDir = Files.createTempDirectory("tika-eval-lucene-"); AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N); try {//from ww w . j av a 2 s .c om Directory directory = FSDirectory.open(luceneDir); AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1); Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer(); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); int maxLen = 1000000; int len = 0; try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) { List<Document> docs = new ArrayList<>(); try (BufferedReader reader = getReader(inputFile)) { String line = reader.readLine(); while (line != null) { len += line.length(); Document document = new Document(); document.add(new TextField(FIELD, line, Field.Store.NO)); docs.add(document); if (len > maxLen) { writer.addDocuments(docs); docs.clear(); len = 0; } line = reader.readLine(); } } if (docs.size() > 0) { writer.addDocuments(docs); } writer.commit(); writer.flush(); } try (IndexReader reader = DirectoryReader.open(directory)) { LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader); Terms terms = wrappedReader.terms(FIELD); TermsEnum termsEnum = terms.iterator(); BytesRef bytesRef = termsEnum.next(); int docsWThisField = wrappedReader.getDocCount(FIELD); while (bytesRef != null) { int df = termsEnum.docFreq(); long tf = termsEnum.totalTermFreq(); if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) { bytesRef = termsEnum.next(); continue; } if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) { String t = bytesRef.utf8ToString(); if (!WHITE_LIST.contains(t) && !BLACK_LIST.contains(t)) { queue.insertWithOverflow(new TokenDFTF(t, df, tf)); } } bytesRef = termsEnum.next(); } } } finally { FileUtils.deleteDirectory(luceneDir.toFile()); } writeTopN(commonTokensFile, queue); }
From source file: org.tallison.gramreaper.terms.DumpTerms.java
License: Apache License
private void dumpTopNField(LeafReader leafReader, String field) throws IOException { AbstractTokenTFDFPriorityQueue queue = config.sort.equals(DumpTermsConfig.SORT.DF) ? new TokenDFPriorityQueue(config.topN) : new TokenTFPriorityQueue(config.topN); Terms terms = leafReader.terms(field); if (terms == null) { StringBuilder sb = new StringBuilder(); int i = 0; for (FieldInfo fieldInfo : leafReader.getFieldInfos()) { if (i++ > 0) { sb.append("\n"); }//from w ww . j av a 2 s . c o m sb.append(fieldInfo.name); } throw new RuntimeException("I can't find field \"" + field + "\".\n" + "I only see:\n" + sb.toString()); } TermsEnum termsEnum = terms.iterator(); BytesRef bytesRef = termsEnum.next(); int docsWThisField = leafReader.getDocCount(field); while (bytesRef != null) { int df = termsEnum.docFreq(); long tf = termsEnum.totalTermFreq(); if (config.minDocFreq > -1 && df < config.minDocFreq) { bytesRef = termsEnum.next(); continue; } if (config.minDocPercentage > -1.0d && (double) df / (double) docsWThisField < config.minDocPercentage) { bytesRef = termsEnum.next(); continue; } if (queue.top() == null || queue.size() < config.topN || (config.sort.equals(DumpTermsConfig.SORT.DF) ? df >= queue.top().df : tf > queue.top().tf)) { String t = bytesRef.utf8ToString(); if (!config.stopWords.contains(t) && !config.startWords.contains(t)) { queue.insertWithOverflow(new TokenDFTF(t, df, tf)); } } bytesRef = termsEnum.next(); } if (config.outputFile == null) { StringBuilder sb = new StringBuilder(); for (TokenDFTF tp : queue.getArray()) { System.out.println(getRow(sb, tp)); } } else if (Files.isDirectory(config.outputFile)) { writeTopN(config.outputFile.resolve(field), queue); } else { writeTopN(config.outputFile, queue); } }
From source file: suonos.lucene.fields.IndexedFieldCountsBuilder.java
License: Apache License
public IndexedFieldCountsBuilder addField(String fieldName, String filter) throws IOException { final IndexedField fld = models.indexedField(fieldName); final Map<String, IndexedFieldTermCount> valuesMap = AntLib.newHashMap(); final TIntIntHashMap ordCounts = new TIntIntHashMap(); if (filter != null) { filter = filter.toLowerCase();/*from ww w . jav a2 s . c o m*/ } // Get count of segments. // int sz = ir.leaves().size(); for (int i = 0; i != sz; i++) { // Get the segment reader. // LeafReader lr = ir.leaves().get(i).reader(); // Doc count for field. Eg "album_genres" // lr.getDocCount(fld.getName()); // Get all documents that have the field "album_genres" // Bits docs = lr.getDocsWithField(fld.getName()); ordCounts.clear(); // Enumerate the field terms. // if (fld.isDocValues()) { if (fld.isMultiValue()) { // docvalues & multivalue is a SortedSetDocValues // Per-Document values in a SortedDocValues are // deduplicated, dereferenced, and sorted into a dictionary // of // unique values. A pointer to the dictionary value // (ordinal) can be retrieved for each document. // Ordinals are dense and in increasing sorted order. // SortedSetDocValues set = lr.getSortedSetDocValues(fld.getName()); if (set != null) { // For all documents that have the field "album_genres": // for (int docId = 0; docId != docs.length(); docId++) { if (docs.get(docId)) { // Enumerate the set of [terms] of // "album_genres" for the document represented // by docId. // Each ord represents the term value. // set.setDocument(docId); // For each term bump up the frequency. 
// long ord; while ((ord = set.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { ordCounts.adjustOrPutValue((int) ord, 1, 1); System.out.println("term=" + set.lookupOrd(ord).utf8ToString()); } } } TermsEnum te = set.termsEnum(); BytesRef term; while ((term = te.next()) != null) { int ord = (int) te.ord(); add(fld, valuesMap, filter, term, ordCounts.get(ord)); } } } else { SortedDocValues set = lr.getSortedDocValues(fld.getName()); if (set != null) { // For all documents that have the field "album_genres": // for (int docId = 0; docId != docs.length(); docId++) { if (docs.get(docId)) { // Get the term - Classical, Rock, etc. // BytesRef term = set.get(docId); add(fld, valuesMap, filter, term, 1); } } } } } else { // Normal field, not a doc value. // Terms terms = lr.terms(fld.getName()); TermsEnum te = terms.iterator(); BytesRef term; while ((term = te.next()) != null) { add(fld, valuesMap, filter, term, te.docFreq()); } } /* * SORTED doc[0] = "aardvark" doc[1] = "beaver" doc[2] = "aardvark" * * doc[0] = 0 doc[1] = 1 doc[2] = 0 * * term[0] = "aardvark" term[1] = "beaver" */ // http://127.0.0.1:8080/api/facets?fields=track_title_a // the above should return B:(4) because titles starting with B are // 4! } // Get the array of term counters. // IndexedFieldTermCount[] list = valuesMap.values().toArray(new IndexedFieldTermCount[0]); // Sort by term. // Arrays.sort(list); // add to the map. // this.fieldCounts.put(fld.getName(), list); return this; }