List of usage examples for org.apache.lucene.index TermsEnum totalTermFreq
public abstract long totalTermFreq() throws IOException;
From source file:alix.lucene.MoreLikeThis.java
License:Apache License
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param termFreqMap a Map of terms and their frequencies * @param vector List of terms and their frequencies for a doc/field *//*from w w w . ja v a 2 s . c o m*/ private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } final int freq = (int) termsEnum.totalTermFreq(); // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
From source file:alix.lucene.MoreLikeThis.java
License:Apache License
/** * Print a term vector for debugging//from w w w . jav a 2s. c om * * @param vector List of terms and their frequencies for a doc/field * @throws IOException */ @SuppressWarnings("unused") private void print(Terms vector) throws IOException { if (vector == null) return; final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; // termsEnum.docFreq() = 1, // The returned Fields instance acts like a single-document inverted index HashMap<String, Long> map = new HashMap<String, Long>(); while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); map.put(spare.toString(), termsEnum.totalTermFreq()); } @SuppressWarnings("unchecked") Map.Entry<String, Long>[] a = map.entrySet().toArray(new Map.Entry[0]); Arrays.sort(a, new Comparator<Map.Entry<String, Long>>() { public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) { return o2.getValue().compareTo(o1.getValue()); } }); for (Map.Entry<String, Long> e : a) { System.out.print(e.getKey() + ":" + e.getValue() + " "); } System.out.println(); }
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.NewsItemToTermsBolt.java
License:Apache License
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap, String id, String field, double weight) throws IOException { Query query = new TermQuery(new Term("id", id)); TopDocs topdocs = searcher.search(query, 1); if (topdocs.totalHits > 0) { int docNr = topdocs.scoreDocs[0].doc; Terms vector = reader.getTermVector(docNr, field); if (vector != null) { TermsEnum termsEnum; termsEnum = vector.iterator(TermsEnum.EMPTY); BytesRef text;/*w w w .ja va 2 s . c o m*/ while ((text = termsEnum.next()) != null) { String term = text.utf8ToString(); int docFreq = reader.docFreq(new Term(field, text)); // ignore really rare terms and really common terms double minFreq = reader.numDocs() * 0.0001; double maxFreq = reader.numDocs() / 3; //double minFreq = 0; //double maxFreq = Double.MAX_VALUE; if (docFreq > minFreq && docFreq < maxFreq) { double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field); double idf = Math.log((double) reader.numDocs() / docFreq); if (!Double.isInfinite(idf)) { if (!termMap.containsKey(term)) { termMap.put(term, tf * idf * weight); } else { termMap.put(term, termMap.get(term) + tf * idf * weight); } } } } } else { logger.debug("no term available for doc=" + docNr + " and field=" + field); } } else { logger.warn("No documents found with id=" + id); } }
From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.storm.bolts.NewsItemToTermsBolt.java
License:Apache License
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap, long id, String field, double weight) throws IOException { Query query = NumericRangeQuery.newLongRange("id", id, id, true, true); TopDocs topdocs = searcher.search(query, 1); if (topdocs.totalHits > 0) { int docNr = topdocs.scoreDocs[0].doc; Terms vector = reader.getTermVector(docNr, field); if (vector != null) { TermsEnum termsEnum; termsEnum = vector.iterator(TermsEnum.EMPTY); BytesRef text;/*from w w w .ja v a 2 s . co m*/ while ((text = termsEnum.next()) != null) { String term = text.utf8ToString(); int docFreq = reader.docFreq(new Term(field, text)); // ignore really rare terms and really common terms //double minFreq = reader.numDocs() * 0.0001; //double maxFreq = reader.numDocs() / 3; double minFreq = 0; double maxFreq = Double.MAX_VALUE; if (docFreq > minFreq && docFreq < maxFreq) { double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field); double idf = Math.log((double) reader.numDocs() / docFreq); if (!Double.isInfinite(idf)) { if (!termMap.containsKey(term)) { termMap.put(term, tf * idf * weight); } else { termMap.put(term, termMap.get(term) + tf * idf * weight); } } } } } else { logger.debug("no term available for doc=" + docNr + " and field=" + field); } } else { logger.warn("No documents found with id=" + id); } }
From source file:cc.twittertools.index.ExtractTermStatisticsFromIndex.java
License:Apache License
/**
 * Command-line tool: prints per-term statistics (term, docFreq, totalTermFreq,
 * tab-separated) for the tweet text field of a Lucene index to stdout.
 * Terms whose docFreq falls below the -min threshold are skipped; their count
 * and summed collection frequency are reported on stderr at the end.
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("min").create(MIN_OPTION));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    // The index location is mandatory; print usage and bail out otherwise.
    if (!cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options);
        System.exit(-1);
    }
    String indexLocation = cmdline.getOptionValue(INDEX_OPTION);
    // Minimum document frequency for a term to be printed; default 1 keeps everything.
    int min = cmdline.hasOption(MIN_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1;
    PrintStream out = new PrintStream(System.out, true, "UTF-8");
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    // SlowCompositeReaderWrapper presents all segments as one merged terms view.
    Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name);
    TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);
    long missingCnt = 0;  // summed collection frequency of the skipped terms
    int skippedTerms = 0; // number of terms below the -min docFreq threshold
    BytesRef bytes = new BytesRef();
    while ((bytes = termsEnum.next()) != null) {
        // Copy the term bytes out of the (reused) BytesRef before decoding.
        byte[] buf = new byte[bytes.length];
        System.arraycopy(bytes.bytes, 0, buf, 0, bytes.length);
        String term = new String(buf, "UTF-8");
        int df = termsEnum.docFreq();
        long cf = termsEnum.totalTermFreq();
        if (df < min) {
            skippedTerms++;
            missingCnt += cf;
            continue;
        }
        out.println(term + "\t" + df + "\t" + cf);
    }
    reader.close();
    out.close();
    System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt);
}
From source file:ci6226.facetsearch.java
public static void main(String[] args) throws Exception { String index = "./myindex"; String field = "text"; String queries = null;/*from w ww . jav a 2 s .com*/ int hitsPerPage = 10; boolean raw = false; //http://lucene.apache.org/core/4_0_0/facet/org/apache/lucene/facet/doc-files/userguide.html#facet_accumulation IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index))); IndexSearcher searcher = new IndexSearcher(reader); // :Post-Release-Update-Version.LUCENE_XY: //TODO: SAME AS HOW U BUILD INDEX Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47); BufferedReader in = null; if (queries != null) { in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8")); } else { in = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); } // :Post-Release-Update-Version.LUCENE_XY: QueryParser parser = new QueryParser(Version.LUCENE_47, field, analyzer); while (true) { System.out.println("Enter query: "); String line = in.readLine(); line = line.trim(); if (line.length() == 0) { break; } Query query = parser.parse(line); System.out.println("Searching for: " + query.toString(field)); Date start = new Date(); searcher.search(query, null, 100); Date end = new Date(); System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms"); TopDocs results = searcher.search(query, 5 * hitsPerPage); ScoreDoc[] hits = results.scoreDocs; int numTotalHits = results.totalHits; //N= max docs //df = totoal matched doc //idf=log(N/df) for (int i = 0; i < hits.length; i++) { Document doc = searcher.doc(hits[i].doc); System.out.println(ANSI_BLUE + (i + 1) + ANSI_RESET + "\nScore=\t" + hits[i].score); String rtext = doc.get(field); System.out.println("Text=\t" + rtext); Terms vector = reader.getTermVector(i, "text"); if (vector == null) continue; // System.out.println(vector.getSumDocFreq()); // Terms vector = reader.getTermVector(hits[i].doc, field); //hits[i].doc=docID TermsEnum termsEnum = vector.iterator(null); termsEnum = 
vector.iterator(termsEnum); Map<String, Integer> frequencies = new HashMap<>(); BytesRef text = null; while ((text = termsEnum.next()) != null) { String term = text.utf8ToString(); int freq = (int) termsEnum.totalTermFreq(); frequencies.put(term, freq); // System.out.println("Time: "+term + " idef "+freq); } } // String[] facetCatlog={""}; System.out.println(numTotalHits + " total matching documents"); } reader.close(); }
From source file:com.github.flaxsearch.resources.PostingsResource.java
License:Apache License
/**
 * Returns the postings (matching docIDs) together with docFreq and
 * totalTermFreq statistics for one term, capped at {@code count} entries and
 * skipping deleted documents.
 *
 * @param segment segment ordinal to read from (null for the composite reader)
 * @param field   field the term belongs to
 * @param term    the term whose postings are listed
 * @param count   maximum number of docIDs to return (defaults to Integer.MAX_VALUE)
 * @throws IOException if index access fails
 */
@GET
public TermData getPostings(@QueryParam("segment") Integer segment, @PathParam("field") String field,
        @PathParam("term") String term, @QueryParam("count") @DefaultValue("2147483647") int count)
        throws IOException {
    TermsEnum te = readerManager.findTermPostings(segment, field, term);
    Bits liveDocs = readerManager.getLiveDocs(segment);
    PostingsEnum pe = te.postings(null, PostingsEnum.NONE);
    int docFreq = te.docFreq();
    long totalTermFreq = te.totalTermFreq();
    // docFreq also counts deleted docs, so it is only an upper bound.
    int size = (docFreq < count) ? docFreq : count;
    int[] postings = new int[size];
    int docId;
    int i = 0;
    while ((docId = pe.nextDoc()) != PostingsEnum.NO_MORE_DOCS && i < count) {
        if (liveDocs != null && liveDocs.get(docId) == false)
            continue;
        postings[i] = docId;
        i++;
    }
    // FIX: when deleted docs were skipped, the original returned an array
    // padded with trailing zeros that read as docID 0; trim to the entries
    // actually collected.
    if (i < postings.length) {
        int[] live = new int[i];
        System.arraycopy(postings, 0, live, 0, i);
        postings = live;
    }
    return new TermData(term, docFreq, totalTermFreq, postings);
}
From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License:Apache License
/**
 * {@inheritDoc}
 * <p>
 * Seeds one weight per term with its collection frequency, builds the FST,
 * then iterates over every document carrying a class label and adjusts term
 * weights whenever the current model misclassifies a document. The FST is
 * rebuilt on batch boundaries (every {@code batchSize} documents).
 */
@Override
public void train(LeafReader leafReader, String textFieldName, String classFieldName, Analyzer analyzer,
        Query query) throws IOException {
    this.textTerms = MultiFields.getTerms(leafReader, textFieldName);
    if (textTerms == null) {
        throw new IOException("term vectors need to be available for field " + textFieldName);
    }
    this.analyzer = analyzer;
    this.textFieldName = textFieldName;
    if (threshold == null || threshold == 0d) {
        // automatic assign a threshold: half of the field's summed doc frequency
        long sumDocFreq = leafReader.getSumDocFreq(textFieldName);
        if (sumDocFreq != -1) {
            this.threshold = (double) sumDocFreq / 2d;
        } else {
            throw new IOException("threshold cannot be assigned since term vectors for field " + textFieldName
                    + " do not exist");
        }
    }
    // TODO : remove this map as soon as we have a writable FST
    SortedMap<String, Double> weights = new TreeMap<>();
    // initial weight of each term = its total frequency across the collection
    TermsEnum termsEnum = textTerms.iterator();
    BytesRef textTerm;
    while ((textTerm = termsEnum.next()) != null) {
        weights.put(textTerm.utf8ToString(), (double) termsEnum.totalTermFreq());
    }
    updateFST(weights);
    IndexSearcher indexSearcher = new IndexSearcher(leafReader);
    int batchCount = 0;
    // match every doc that has any value in the class field, optionally
    // intersected with the caller-supplied filter query
    BooleanQuery q = new BooleanQuery();
    q.add(new BooleanClause(new WildcardQuery(new Term(classFieldName, "*")), BooleanClause.Occur.MUST));
    if (query != null) {
        q.add(new BooleanClause(query, BooleanClause.Occur.MUST));
    }
    // run the search and use stored field values
    for (ScoreDoc scoreDoc : indexSearcher.search(q, Integer.MAX_VALUE).scoreDocs) {
        Document doc = indexSearcher.doc(scoreDoc.doc);
        IndexableField textField = doc.getField(textFieldName);
        // get the expected result
        IndexableField classField = doc.getField(classFieldName);
        if (textField != null && classField != null) {
            // assign class to the doc
            ClassificationResult<Boolean> classificationResult = assignClass(textField.stringValue());
            Boolean assignedClass = classificationResult.getAssignedClass();
            Boolean correctClass = Boolean.valueOf(classField.stringValue());
            // modifier is the sign of the correction; 0 means the prediction was right
            long modifier = correctClass.compareTo(assignedClass);
            if (modifier != 0) {
                updateWeights(leafReader, scoreDoc.doc, assignedClass, weights, modifier,
                        batchCount % batchSize == 0);
            }
            batchCount++;
        }
    }
    weights.clear(); // free memory while waiting for GC
}
From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License:Apache License
private void updateWeights(LeafReader leafReader, int docId, Boolean assignedClass, SortedMap<String, Double> weights, double modifier, boolean updateFST) throws IOException { TermsEnum cte = textTerms.iterator(); // get the doc term vectors Terms terms = leafReader.getTermVector(docId, textFieldName); if (terms == null) { throw new IOException("term vectors must be stored for field " + textFieldName); }/* w w w . j a v a2 s.c o m*/ TermsEnum termsEnum = terms.iterator(); BytesRef term; while ((term = termsEnum.next()) != null) { cte.seekExact(term); if (assignedClass != null) { long termFreqLocal = termsEnum.totalTermFreq(); // update weights Long previousValue = Util.get(fst, term); String termString = term.utf8ToString(); weights.put(termString, previousValue + modifier * termFreqLocal); } } if (updateFST) { updateFST(weights); } }
From source file:com.meizu.nlp.classification.utils.DocToDoubleVectorUtils.java
License:Apache License
/** * create a sparse <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc * @param docTerms term vectors for a given document * @param fieldTerms field term vectors/*from w w w.j ava 2 s. c o m*/ * @return a sparse vector of <code>Double</code>s as an array * @throws IOException in case accessing the underlying index fails */ public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException { TermsEnum fieldTermsEnum = fieldTerms.iterator(); Double[] freqVector = null; if (docTerms != null && fieldTerms.size() > -1) { freqVector = new Double[(int) fieldTerms.size()]; int i = 0; TermsEnum docTermsEnum = docTerms.iterator(); BytesRef term; while ((term = fieldTermsEnum.next()) != null) { TermsEnum.SeekStatus seekStatus = docTermsEnum.seekCeil(term); if (seekStatus.equals(TermsEnum.SeekStatus.END)) { docTermsEnum = docTerms.iterator(); } if (seekStatus.equals(TermsEnum.SeekStatus.FOUND)) { long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document freqVector[i] = Long.valueOf(termFreqLocal).doubleValue(); } else { freqVector[i] = 0d; } i++; } } return freqVector; }