List of usage examples for org.apache.lucene.index.TermsEnum#docFreq()
/**
 * Reports the document frequency of the term this enum is currently positioned on
 * (per the Lucene {@code TermsEnum} API: the number of documents containing that term).
 *
 * @return the document frequency of the current term
 * @throws IOException declared so that implementations may read index data
 */
public abstract int docFreq() throws IOException;
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
/**
 * Walks every term of every field in the given index and registers one
 * {@code UnilateralBlock} per term that appears in at least two documents.
 *
 * <p>Each block holds the entity ids (looked up through {@code getDocumentIds})
 * of the documents that contain the term. An {@link IOException} raised while
 * reading the index is logged at SEVERE level and ends the traversal.
 *
 * @param d1Index the index whose terms are turned into blocks
 */
protected void parseIndex(IndexReader d1Index) {
    try {
        final int[] entityIdByDoc = getDocumentIds(d1Index);
        final Fields indexedFields = MultiFields.getFields(d1Index);

        for (String fieldName : indexedFields) {
            final TermsEnum cursor = indexedFields.terms(fieldName).iterator();

            for (BytesRef token = cursor.next(); token != null; token = cursor.next()) {
                // A term seen in a single document cannot group anything.
                if (cursor.docFreq() >= 2) {
                    final List<Integer> members = new ArrayList<>();
                    final PostingsEnum docs = MultiFields.getTermDocsEnum(d1Index, fieldName, token);

                    for (int docId = docs.nextDoc();
                            docId != DocIdSetIterator.NO_MORE_DOCS;
                            docId = docs.nextDoc()) {
                        members.add(entityIdByDoc[docId]);
                    }

                    blocks.add(new UnilateralBlock(Converter.convertCollectionToArray(members)));
                }
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
}
From source file:br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java
License:Open Source License
/**
 * Computes a Rocchio-weighted TF-IDF score for the terms of the
 * {@code Constants.DOC_CONTENT} field of the index stored in {@code directory}.
 *
 * <p>For every posting of every term the score {@code BETA * tf * idf} is written
 * into the result map under the term text, so the value kept for a term is the
 * one computed from the last posting visited.
 *
 * <p>The reader is owned solely by the try-with-resources statement. The previous
 * version also closed it in a {@code finally} inside the per-segment callback,
 * which closed the reader after the first segment while later segments were
 * still to be read.
 *
 * @param directory the Lucene directory holding the index to score
 * @return the (term, score) entries collected from all segments
 * @throws CorruptIndexException if the index cannot be opened because it is corrupt
 * @throws IOException if the index cannot be opened or closed
 */
private List<Entry<String, Float>> getTermScoreList(Directory directory)
        throws CorruptIndexException, IOException {
    Map<String, Float> termScoreMap = new HashMap<>();
    ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity();

    try (IndexReader idxReader = DirectoryReader.open(directory)) {
        idxReader.leaves().stream().map((leaf) -> leaf.reader()).forEach((reader) -> {
            try {
                Terms terms = reader.terms(Constants.DOC_CONTENT);
                if (terms == null) {
                    // This segment has no indexed terms for the field; nothing to score.
                    return;
                }

                TermsEnum termsEnum = terms.iterator();
                PostingsEnum postings = null;
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    postings = termsEnum.postings(postings);
                    while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                        int freq = postings.freq();
                        float tf = sim.tf(freq);
                        // NOTE(review): the document count comes from the enclosing
                        // indexReader, not from idxReader opened above - confirm
                        // both refer to the same index.
                        float idf = sim.idf(termsEnum.docFreq(), indexReader.numDocs());
                        termScoreMap.put(text.utf8ToString(), BETA * (tf * idf));
                    }
                }
            } catch (IOException ex) {
                Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex);
            }
        });
    }

    return new ArrayList<>(termScoreMap.entrySet());
}
From source file:br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java
License:Open Source License
/**
 * Looks up {@code term} (case-insensitively) in the {@code Constants.DOC_CONTENT}
 * field of the index stored in {@code directory} and returns {@code tf * idf}
 * computed from the first posting found for it.
 *
 * <p>Segments are scanned in order. A segment without terms for the field is
 * skipped, and an {@link IOException} raised while reading one segment is logged
 * and the scan continues with the next segment.
 *
 * @param directory the Lucene directory holding the index
 * @param term the term text to look for
 * @return the TF-IDF score of the first matching posting, or {@code 0} if the
 *         term has no posting in any segment
 * @throws CorruptIndexException if the index cannot be opened because it is corrupt
 * @throws IOException if the index cannot be opened or closed
 */
private float getScore(Directory directory, String term) throws CorruptIndexException, IOException {
    try (IndexReader idxReader = DirectoryReader.open(directory)) {
        ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity();

        for (LeafReaderContext context : idxReader.leaves()) {
            LeafReader reader = context.reader();
            try {
                Terms terms = reader.terms(Constants.DOC_CONTENT);
                if (terms == null) {
                    // Field is not indexed in this segment.
                    continue;
                }

                TermsEnum termsEnum = terms.iterator();
                PostingsEnum postings = null;
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    if (!text.utf8ToString().equalsIgnoreCase(term)) {
                        continue;
                    }
                    // Postings are only needed for the matching term.
                    postings = termsEnum.postings(postings);
                    if (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                        float tf = sim.tf(postings.freq());
                        // NOTE(review): the document count comes from the enclosing
                        // indexReader, not from idxReader opened above - confirm
                        // both refer to the same index.
                        float idf = sim.idf(termsEnum.docFreq(), indexReader.numDocs());
                        return tf * idf;
                    }
                }
            } catch (IOException ex) {
                Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }

    return 0;
}
From source file:cc.twittertools.index.ExtractTermStatisticsFromIndex.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("min").create(MIN_OPTION)); CommandLine cmdline = null;//from w w w. j a v a 2s. c om CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options); System.exit(-1); } String indexLocation = cmdline.getOptionValue(INDEX_OPTION); int min = cmdline.hasOption(MIN_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1; PrintStream out = new PrintStream(System.out, true, "UTF-8"); IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation))); Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name); TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY); long missingCnt = 0; int skippedTerms = 0; BytesRef bytes = new BytesRef(); while ((bytes = termsEnum.next()) != null) { byte[] buf = new byte[bytes.length]; System.arraycopy(bytes.bytes, 0, buf, 0, bytes.length); String term = new String(buf, "UTF-8"); int df = termsEnum.docFreq(); long cf = termsEnum.totalTermFreq(); if (df < min) { skippedTerms++; missingCnt += cf; continue; } out.println(term + "\t" + df + "\t" + cf); } reader.close(); out.close(); System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt); }
From source file:com.basistech.lucene.tools.LuceneQueryTool.java
License:Apache License
private void enumerateTerms(String field) throws IOException { if (!allFieldNames.contains(field)) { throw new RuntimeException("Invalid field name: " + field); }// w w w . j a v a 2s . c o m List<LeafReaderContext> leaves = indexReader.leaves(); TermsEnum termsEnum; boolean unindexedField = true; Map<String, Integer> termCountMap = new TreeMap<>(); for (LeafReaderContext leaf : leaves) { Terms terms = leaf.reader().terms(field); if (terms == null) { continue; } unindexedField = false; termsEnum = terms.iterator(); BytesRef bytesRef; while ((bytesRef = termsEnum.next()) != null) { String term = bytesRef.utf8ToString(); if (termCountMap.containsKey(term)) { termCountMap.put(term, termsEnum.docFreq() + termCountMap.get(term)); } else { termCountMap.put(term, termsEnum.docFreq()); } } } if (unindexedField) { throw new RuntimeException("Unindexed field: " + field); } for (Map.Entry<String, Integer> entry : termCountMap.entrySet()) { defaultOut.println(entry.getKey() + " (" + entry.getValue() + ")"); } }
From source file:com.facebook.presto.operator.HashAggregationOperator.java
License:Apache License
private Map<String, Long> GetGroupByResult() throws IOException { IndexReader reader = null;//from w w w. j a va 2 s . c o m Map<String, Long> returnMap = new HashMap<String, Long>(); try { reader = DirectoryReader .open(FSDirectory.open(Paths.get("/home/liyong/workspace-neno/lucenetest/index"))); } catch (IOException e) { e.printStackTrace(); } IndexSearcher searcher = new IndexSearcher(reader); Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "orderpriority"); TermsEnum te = terms.iterator(); while (te.next() != null) { String name = te.term().utf8ToString(); int count = te.docFreq(); returnMap.put(name, Long.valueOf(count)); } return returnMap; }
From source file:com.facebook.presto.operator.ScanFilterAndProjectOperator.java
License:Apache License
private Map<String, Long> getCountResult() throws IOException { IndexReader reader = null;//from w w w. j av a2s .com Map<String, Long> returnMap = new HashMap<String, Long>(); try { reader = DirectoryReader .open(FSDirectory.open(Paths.get("/home/liyong/workspace-neno/lucenetest/index"))); } catch (IOException e) { e.printStackTrace(); } IndexSearcher searcher = new IndexSearcher(reader); Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "orderpriority"); TermsEnum te = terms.iterator(); while (te.next() != null) { String name = te.term().utf8ToString(); int count = te.docFreq(); returnMap.put(name, Long.valueOf(count)); } return returnMap; }
From source file:com.github.flaxsearch.resources.PostingsResource.java
License:Apache License
@GET public TermData getPostings(@QueryParam("segment") Integer segment, @PathParam("field") String field, @PathParam("term") String term, @QueryParam("count") @DefaultValue("2147483647") int count) throws IOException { TermsEnum te = readerManager.findTermPostings(segment, field, term); Bits liveDocs = readerManager.getLiveDocs(segment); PostingsEnum pe = te.postings(null, PostingsEnum.NONE); int docFreq = te.docFreq(); long totalTermFreq = te.totalTermFreq(); int size = (docFreq < count) ? docFreq : count; int[] postings = new int[size]; int docId;//from w w w . j a v a 2 s. co m int i = 0; while ((docId = pe.nextDoc()) != PostingsEnum.NO_MORE_DOCS && i < count) { if (liveDocs != null && liveDocs.get(docId) == false) continue; postings[i] = docId; i++; } return new TermData(term, docFreq, totalTermFreq, postings); }
From source file:com.globalsight.ling.lucene.HighFreqTerms.java
License:Apache License
public static void main(String[] args) throws Exception { IndexReader reader = null;// w ww . j a v a 2 s.c o m if (args.length == 1) { SimpleFSDirectory fsd = new SimpleFSDirectory(new File(args[0])); reader = DirectoryReader.open(fsd); } else { usage(); System.exit(1); } TermInfoQueue tiq = new TermInfoQueue(numTerms); //TODO: IS field right? String field = IndexDocument.TEXT; Terms terms = reader.getTermVector(0, field); //TermEnum terms = reader.terms(); TermsEnum termsEnum = terms.iterator(null); BytesRef next = null; while ((next = termsEnum.next()) != null) { tiq.insertWithOverflow(new TermInfo(new Term(field, termsEnum.term()), termsEnum.docFreq())); } while (tiq.size() != 0) { TermInfo termInfo = (TermInfo) tiq.pop(); System.out.println(termInfo.term + " " + termInfo.docFreq); } reader.close(); }
From source file:com.meizu.nlp.classification.CachingNaiveBayesClassifier.java
License:Apache License
/** * This function is building the frame of the cache. The cache is storing the * word occurrences to the memory after those searched once. This cache can * made 2-100x speedup in proper use, but can eat lot of memory. There is an * option to lower the memory consume, if a word have really low occurrence in * the index you could filter it out. The other parameter is switching between * the term searching, if it true, just the terms in the skeleton will be * searched, but if it false the terms whoes not in the cache will be searched * out too (but not cached).//from www. j ava2s . c o m * * @param minTermOccurrenceInCache Lower cache size with higher value. * @param justCachedTerms The switch for fully exclude low occurrence docs. * @throws IOException If there is a low-level I/O error. */ public void reInitCache(int minTermOccurrenceInCache, boolean justCachedTerms) throws IOException { this.justCachedTerms = justCachedTerms; this.docsWithClassSize = countDocsWithClass(); termCClassHitCache.clear(); cclasses.clear(); classTermFreq.clear(); // build the cache for the word Map<String, Long> frequencyMap = new HashMap<>(); for (String textFieldName : textFieldNames) { TermsEnum termsEnum = leafReader.terms(textFieldName).iterator(); while (termsEnum.next() != null) { BytesRef term = termsEnum.term(); String termText = term.utf8ToString(); long frequency = termsEnum.docFreq(); Long lastfreq = frequencyMap.get(termText); if (lastfreq != null) frequency += lastfreq; frequencyMap.put(termText, frequency); } } for (Map.Entry<String, Long> entry : frequencyMap.entrySet()) { if (entry.getValue() > minTermOccurrenceInCache) { termCClassHitCache.put(entry.getKey(), new ConcurrentHashMap<BytesRef, Integer>()); } } // fill the class list Terms terms = MultiFields.getTerms(leafReader, classFieldName); TermsEnum termsEnum = terms.iterator(); while ((termsEnum.next()) != null) { cclasses.add(BytesRef.deepCopyOf(termsEnum.term())); } // fill the classTermFreq map for (BytesRef 
cclass : cclasses) { double avgNumberOfUniqueTerms = 0; for (String textFieldName : textFieldNames) { terms = MultiFields.getTerms(leafReader, textFieldName); long numPostings = terms.getSumDocFreq(); // number of term/doc pairs avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount(); } int docsWithC = leafReader.docFreq(new Term(classFieldName, cclass)); classTermFreq.put(cclass, avgNumberOfUniqueTerms * docsWithC); } }