List of usage examples for org.apache.lucene.index.TermsEnum.EMPTY
TermsEnum EMPTY
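All of the examples below target the Lucene 4.x API, where Terms.iterator(TermsEnum reuse) takes a reuse argument and TermsEnum.EMPTY serves as a safe, non-null enumerator: it can be passed in as the reuse hint, used as a default when a field or term vector is absent, or returned when a query can match nothing. A minimal sketch of that shared pattern (the dumpTerms helper and its reader/field parameters are hypothetical, not taken from any of the projects below):

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TermsEnumEmptyExample {
    // Prints every term of the given field together with its document frequency.
    static void dumpTerms(IndexReader reader, String field) throws IOException {
        Terms terms = MultiFields.getTerms(reader, field);
        TermsEnum termsEnum = TermsEnum.EMPTY; // safe non-null default if the field is missing
        if (terms != null) {
            termsEnum = terms.iterator(termsEnum); // EMPTY passed as the reuse hint
        }
        BytesRef term;
        while ((term = termsEnum.next()) != null) { // EMPTY.next() simply returns null
            System.out.println(term.utf8ToString() + "\t" + termsEnum.docFreq());
        }
    }
}

Because TermsEnum.EMPTY.next() returns null immediately, the loop needs no separate null check for the missing-field case.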
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.NewsItemToTermsBolt.java
License:Apache License
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap,
        String id, String field, double weight) throws IOException {
    Query query = new TermQuery(new Term("id", id));
    TopDocs topdocs = searcher.search(query, 1);

    if (topdocs.totalHits > 0) {
        int docNr = topdocs.scoreDocs[0].doc;
        Terms vector = reader.getTermVector(docNr, field);
        if (vector != null) {
            TermsEnum termsEnum = vector.iterator(TermsEnum.EMPTY);
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                int docFreq = reader.docFreq(new Term(field, text));
                // ignore really rare terms and really common terms
                double minFreq = reader.numDocs() * 0.0001;
                double maxFreq = reader.numDocs() / 3.0;
                //double minFreq = 0;
                //double maxFreq = Double.MAX_VALUE;
                if (docFreq > minFreq && docFreq < maxFreq) {
                    double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field);
                    double idf = Math.log((double) reader.numDocs() / docFreq);
                    if (!Double.isInfinite(idf)) {
                        if (!termMap.containsKey(term)) {
                            termMap.put(term, tf * idf * weight);
                        } else {
                            termMap.put(term, termMap.get(term) + tf * idf * weight);
                        }
                    }
                }
            }
        } else {
            logger.debug("no term available for doc=" + docNr + " and field=" + field);
        }
    } else {
        logger.warn("No documents found with id=" + id);
    }
}
From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.storm.bolts.NewsItemToTermsBolt.java
License:Apache License
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap,
        long id, String field, double weight) throws IOException {
    Query query = NumericRangeQuery.newLongRange("id", id, id, true, true);
    TopDocs topdocs = searcher.search(query, 1);

    if (topdocs.totalHits > 0) {
        int docNr = topdocs.scoreDocs[0].doc;
        Terms vector = reader.getTermVector(docNr, field);
        if (vector != null) {
            TermsEnum termsEnum = vector.iterator(TermsEnum.EMPTY);
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                int docFreq = reader.docFreq(new Term(field, text));
                // ignore really rare terms and really common terms
                //double minFreq = reader.numDocs() * 0.0001;
                //double maxFreq = reader.numDocs() / 3;
                double minFreq = 0;
                double maxFreq = Double.MAX_VALUE;
                if (docFreq > minFreq && docFreq < maxFreq) {
                    double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field);
                    double idf = Math.log((double) reader.numDocs() / docFreq);
                    if (!Double.isInfinite(idf)) {
                        if (!termMap.containsKey(term)) {
                            termMap.put(term, tf * idf * weight);
                        } else {
                            termMap.put(term, termMap.get(term) + tf * idf * weight);
                        }
                    }
                }
            }
        } else {
            logger.debug("no term available for doc=" + docNr + " and field=" + field);
        }
    } else {
        logger.warn("No documents found with id=" + id);
    }
}
From source file:cc.twittertools.index.ExtractTermStatisticsFromIndex.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("min").create(MIN_OPTION)); CommandLine cmdline = null;/*from w ww . ja v a2 s. co m*/ CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options); System.exit(-1); } String indexLocation = cmdline.getOptionValue(INDEX_OPTION); int min = cmdline.hasOption(MIN_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1; PrintStream out = new PrintStream(System.out, true, "UTF-8"); IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation))); Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name); TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY); long missingCnt = 0; int skippedTerms = 0; BytesRef bytes = new BytesRef(); while ((bytes = termsEnum.next()) != null) { byte[] buf = new byte[bytes.length]; System.arraycopy(bytes.bytes, 0, buf, 0, bytes.length); String term = new String(buf, "UTF-8"); int df = termsEnum.docFreq(); long cf = termsEnum.totalTermFreq(); if (df < min) { skippedTerms++; missingCnt += cf; continue; } out.println(term + "\t" + df + "\t" + cf); } reader.close(); out.close(); System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt); }
From source file:com.joliciel.jochre.search.highlight.LuceneQueryHighlighter.java
License:Open Source License
public Map<Integer, Set<HighlightTerm>> highlight(Set<Integer> docIds, Set<String> fields) {
    try {
        Map<Integer, Set<HighlightTerm>> termMap = new HashMap<Integer, Set<HighlightTerm>>();

        Map<Integer, Document> idToDocMap = new HashMap<Integer, Document>();
        Map<Integer, CoordinateStorage> idToCoordinateStorageMap = new HashMap<Integer, CoordinateStorage>();
        Map<Integer, Set<Integer>> myLeaves = new HashMap<Integer, Set<Integer>>();
        for (int docId : docIds) {
            Document luceneDoc = indexSearcher.doc(docId);
            idToDocMap.put(docId, luceneDoc);
            JochreIndexDocument jochreDoc = searchService.getJochreIndexDocument(indexSearcher, docId);
            idToCoordinateStorageMap.put(docId, jochreDoc.getCoordinateStorage());
            termMap.put(docId, new TreeSet<HighlightTerm>());

            int leaf = ReaderUtil.subIndex(docId, leaves);
            Set<Integer> docsPerLeaf = myLeaves.get(leaf);
            if (docsPerLeaf == null) {
                docsPerLeaf = new HashSet<Integer>();
                myLeaves.put(leaf, docsPerLeaf);
            }
            docsPerLeaf.add(docId);
        }

        for (int leaf : myLeaves.keySet()) {
            if (LOG.isTraceEnabled())
                LOG.trace("Searching leaf " + leaf);
            Set<Integer> docsPerLeaf = myLeaves.get(leaf);
            AtomicReaderContext subContext = leaves.get(leaf);
            AtomicReader atomicReader = subContext.reader();

            int fieldCounter = 0;
            for (String field : fields) {
                fieldCounter++;
                if (LOG.isTraceEnabled())
                    LOG.trace("Field " + fieldCounter + ": " + field);
                Terms atomicReaderTerms = atomicReader.terms(field);
                if (atomicReaderTerms == null) {
                    continue; // nothing to do
                }

                TermsEnum termsEnum = atomicReaderTerms.iterator(TermsEnum.EMPTY);

                int termCounter = 0;
                for (BytesRef term : terms) {
                    termCounter++;
                    if (LOG.isTraceEnabled())
                        LOG.trace("Searching for term " + termCounter + ": " + term.utf8ToString()
                                + " in field " + field);

                    if (!termsEnum.seekExact(term)) {
                        continue; // term not found
                    }

                    DocsAndPositionsEnum docPosEnum = termsEnum.docsAndPositions(null, null,
                            DocsAndPositionsEnum.FLAG_OFFSETS);
                    int relativeDocId = docPosEnum.nextDoc();
                    while (relativeDocId != DocsAndPositionsEnum.NO_MORE_DOCS) {
                        int docId = subContext.docBase + relativeDocId;
                        if (docsPerLeaf.contains(docId)) {
                            Document doc = idToDocMap.get(docId);
                            Set<HighlightTerm> highlightTerms = termMap.get(docId);

                            // Retrieve the term frequency in the current document
                            int freq = docPosEnum.freq();
                            if (LOG.isTraceEnabled()) {
                                String extId = doc.get("id");
                                String path = doc.get("path");
                                LOG.trace("Found " + freq + " matches for doc " + docId + ", extId: " + extId
                                        + ", path: " + path);
                            }
                            for (int i = 0; i < freq; i++) {
                                int position = docPosEnum.nextPosition();
                                int start = docPosEnum.startOffset();
                                int end = docPosEnum.endOffset();
                                if (LOG.isTraceEnabled())
                                    LOG.trace("Found match " + position + " at docId " + docId + ", field "
                                            + field + " start=" + start + ", end=" + end);

                                CoordinateStorage coordinateStorage = idToCoordinateStorageMap.get(docId);
                                int imageIndex = coordinateStorage.getImageIndex(start);
                                int pageIndex = coordinateStorage.getPageIndex(start);
                                HighlightTerm highlightTerm = new HighlightTerm(docId, field, start, end,
                                        imageIndex, pageIndex);
                                highlightTerm.setWeight(this.weigh(term));
                                if (highlightTerm.getWeight() > 0)
                                    highlightTerms.add(highlightTerm);
                            }
                        }
                        relativeDocId = docPosEnum.nextDoc();
                    }
                } // next term
            } // next field
        } // next index leaf to search
        return termMap;
    } catch (IOException e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}
From source file:com.sindicetech.siren.search.node.NodeNumericRangeQuery.java
License:Open Source License
@Override
@SuppressWarnings("unchecked")
protected TermsEnum getTermsEnum(final Terms terms, final AttributeSource atts) throws IOException {
    // very strange: java.lang.Number itself is not Comparable, but all subclasses used here are
    return (min != null && max != null && ((Comparable<T>) min).compareTo(max) > 0) ? TermsEnum.EMPTY
            : new NumericRangeTermsEnum(terms.iterator(null));
}
From source file:com.sindicetech.siren.search.node.NodeTermCollectingRewrite.java
License:Open Source License
final void collectTerms(final IndexReader reader, final MultiNodeTermQuery query, final TermCollector collector)
        throws IOException {
    final IndexReaderContext topReaderContext = reader.getContext();
    Comparator<BytesRef> lastTermComp = null;
    for (final AtomicReaderContext context : topReaderContext.leaves()) {
        final Fields fields = context.reader().fields();
        if (fields == null) {
            // reader has no fields
            continue;
        }

        final Terms terms = fields.terms(query.field);
        if (terms == null) {
            // field does not exist
            continue;
        }

        final TermsEnum termsEnum = this.getTermsEnum(query, terms, collector.attributes);
        assert termsEnum != null;

        if (termsEnum == TermsEnum.EMPTY)
            continue;

        // Check comparator compatibility:
        final Comparator<BytesRef> newTermComp = termsEnum.getComparator();
        if (lastTermComp != null && newTermComp != null && newTermComp != lastTermComp)
            throw new RuntimeException(
                    "term comparator should not change between segments: " + lastTermComp + " != " + newTermComp);
        lastTermComp = newTermComp;
        collector.setReaderContext(topReaderContext, context);
        collector.setNextEnum(termsEnum);
        BytesRef bytes;
        while ((bytes = termsEnum.next()) != null) {
            if (!collector.collect(bytes))
                return; // interrupt whole term collection, so also don't iterate other subReaders
        }
    }
}
From source file:com.sindicetech.siren.search.node.NodeTermRangeQuery.java
License:Open Source License
@Override
protected TermsEnum getTermsEnum(final Terms terms, final AttributeSource atts) throws IOException {
    if (lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) {
        return TermsEnum.EMPTY;
    }

    final TermsEnum tenum = terms.iterator(null);

    if ((lowerTerm == null || (includeLower && lowerTerm.length == 0)) && upperTerm == null) {
        return tenum;
    }
    return new TermRangeTermsEnum(tenum, lowerTerm, upperTerm, includeLower, includeUpper);
}
From source file:de.unihildesheim.iw.cli.DumpCommonTerms.java
License:Open Source License
/**
 * Class setup.
 *
 * @param args Commandline arguments.
 * @throws IOException Thrown on low-level i/o-errors
 */
@SuppressWarnings("ObjectAllocationInLoop")
@SuppressFBWarnings("SEC_SIDE_EFFECT_CONSTRUCTOR")
private void runMain(final String... args) throws IOException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    LOG.info("Writing terms to '{}'.", this.cliParams.targetFile);

    assert this.cliParams.lang != null;
    assert this.cliParams.stopFilePattern != null;
    final Set<String> sWords = CliCommon.getStopwords(this.cliParams.lang, this.cliParams.stopFileFormat,
            this.cliParams.stopFilePattern);

    assert this.cliParams.idxReader != null;
    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Terms terms = MultiFields.getTerms(this.cliParams.idxReader, this.cliParams.field);
    TermsEnum termsEnum = TermsEnum.EMPTY;
    BytesRef term;

    if (terms != null) {
        termsEnum = terms.iterator(termsEnum);
        term = termsEnum.next();

        assert this.cliParams.targetFile != null;
        try (CSVWriter csvWriter = new CSVWriter(new OutputStreamWriter(
                new FileOutputStream(this.cliParams.targetFile), StandardCharsets.UTF_8))) {
            // write header line
            csvWriter.writeNext(new String[] { "term", "relDF" });

            while (term != null) {
                final String termStr = term.utf8ToString();
                if (!sWords.contains(termStr.toLowerCase())) {
                    final double docFreq = (double) termsEnum.docFreq();
                    if (docFreq > 0d) {
                        final double relDocFreq = docFreq / (double) maxDoc;
                        if (relDocFreq > this.cliParams.threshold) {
                            // log term
                            csvWriter.writeNext(new String[] { termStr,
                                    // make exponential string R compatible
                                    Double.toString(relDocFreq).toLowerCase() });
                        }
                    }
                }
                term = termsEnum.next();
            }
        }
    }
}
From source file:de.unihildesheim.iw.cli.DumpIPCs.java
License:Open Source License
private void runMain(final String... args) throws IOException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    assert this.cliParams.idxReader != null;
    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Parser ipcParser = new Parser();
    ipcParser.separatorChar(this.cliParams.sep);
    ipcParser.allowZeroPad(this.cliParams.zeroPad);

    final DirectoryReader reader = DirectoryReader.open(FSDirectory.open(this.cliParams.idxDir.toPath()));
    final Builder idxReaderBuilder = new Builder(reader);

    Pattern rx_ipc = null;

    if (this.cliParams.ipc != null) {
        final IPCRecord ipc = ipcParser.parse(this.cliParams.ipc);
        final BooleanQuery bq = new BooleanQuery();
        rx_ipc = Pattern.compile(ipc.toRegExpString(this.cliParams.sep));
        if (LOG.isDebugEnabled()) {
            LOG.debug("IPC regExp: rx={} pat={}", ipc.toRegExpString(this.cliParams.sep), rx_ipc);
        }

        bq.add(new QueryWrapperFilter(IPCClassQuery.get(ipc, this.cliParams.sep)), Occur.MUST);
        bq.add(new QueryWrapperFilter(
                new IPCFieldFilter(new IPCFieldFilterFunctions.SloppyMatch(ipc), ipcParser)), Occur.MUST);
        idxReaderBuilder.queryFilter(new QueryWrapperFilter(bq));
    }

    final IndexReader idxReader = idxReaderBuilder.build();
    if (idxReader.numDocs() > 0) {
        final Terms terms = MultiFields.getTerms(idxReader, LUCENE_CONF.FLD_IPC);
        TermsEnum termsEnum = TermsEnum.EMPTY;
        BytesRef term;
        if (terms != null) {
            termsEnum = terms.iterator(termsEnum);
            term = termsEnum.next();

            final int[] count = { 0, 0 }; // match, exclude
            while (term != null) {
                final String code = term.utf8ToString();

                if (rx_ipc == null || (rx_ipc.matcher(code).matches())) {
                    final IPCRecord record = ipcParser.parse(code);
                    try {
                        System.out.println(code + ' ' + record + " (" + record.toFormattedString() + ") " + '['
                                + record.toRegExpString('-') + ']');
                    } catch (final IllegalArgumentException e) {
                        System.out.println(code + ' ' + "INVALID (" + code + ')');
                    }
                    count[0]++;
                } else {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Skip non matching IPC: {}", code);
                    }
                    count[1]++;
                }
                term = termsEnum.next();
            }
            LOG.info("match={} skip={}", count[0], count[1]);
        }
    } else {
        LOG.info("No documents left after filtering.");
    }
}
From source file:de.unihildesheim.iw.cli.DumpTermData.java
License:Open Source License
/**
 * Class setup.
 *
 * @param args Commandline arguments.
 * @throws IOException Thrown on low-level i/o-errors
 * @throws ClassNotFoundException Thrown if JDBC driver could not be loaded
 * @throws SQLException Thrown, if connection to the database has failed
 * @throws BuildException Thrown, if building a {@link FilteredDirectoryReader} instance has failed
 */
@SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess")
private void runMain(final String... args)
        throws IOException, SQLException, ClassNotFoundException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    LOG.info("Writing term-data to '{}'.", this.cliParams.dbFile);

    // table manager instance: Target database for term data
    try (final TermDataDB db = new TermDataDB(this.cliParams.dbFile)) {
        // create meta & data table
        final Table termsTable;
        if (this.cliParams.ipcRec == null) {
            termsTable = new TermsTable();
        } else {
            // include optional IPC field
            termsTable = new TermsTable(TermsTable.FieldsOptional.IPC);
        }
        final Table metaTable = new MetaTable();
        db.createTables(termsTable, metaTable);

        try (final TermsTable.Writer dataWriter = new TermsTable.Writer(db.getConnection())) {
            // write meta-data
            try (final MetaTable.Writer metaWriter = new MetaTable.Writer(db.getConnection())) {
                metaWriter.addContent(new TableFieldContent(metaTable)
                        .setValue(MetaTable.Fields.TABLE_NAME, termsTable.getName())
                        .setValue(MetaTable.Fields.CMD, StringUtils.join(args, " ")));
            }

            final Set<String> sWords;
            if (this.cliParams.stopFilePattern != null) {
                sWords = CliCommon.getStopwords(this.cliParams.lang, this.cliParams.stopFileFormat,
                        this.cliParams.stopFilePattern);
            } else {
                sWords = Collections.emptySet();
            }

            final int maxDoc = this.cliParams.idxReader.maxDoc();
            if (maxDoc == 0) {
                LOG.error("Empty index.");
                return;
            }

            final Terms terms = MultiFields.getTerms(this.cliParams.idxReader, this.cliParams.field);
            TermsEnum termsEnum = TermsEnum.EMPTY;
            BytesRef term;
            if (terms != null) {
                termsEnum = terms.iterator(termsEnum);
                term = termsEnum.next();

                final AtomicLong count = new AtomicLong(0L);
                @SuppressWarnings("AnonymousInnerClassMayBeStatic")
                final TaskObserver obs = new TaskObserver(new TaskObserverMessage() {
                    @Override
                    public void call(@NotNull final TimeMeasure tm) {
                        LOG.info("Collected {} terms after {}.",
                                NumberFormat.getIntegerInstance().format(count.get()), tm.getTimeString());
                    }
                }).start();

                // normalize some parameters
                final String fieldName = StringUtils.lowerCase(this.cliParams.field);
                final String langName = StringUtils.lowerCase(this.cliParams.lang);

                while (term != null) {
                    final String termStr = term.utf8ToString();
                    if (!sWords.contains(termStr.toLowerCase())) {
                        final double docFreq = (double) termsEnum.docFreq();
                        if (docFreq > 0d) {
                            final double relDocFreq = docFreq / (double) maxDoc;
                            if (relDocFreq > this.cliParams.threshold) {
                                @SuppressWarnings("ObjectAllocationInLoop")
                                final TableFieldContent tfc = new TableFieldContent(termsTable);
                                tfc.setValue(TermsTable.Fields.TERM, termStr);
                                tfc.setValue(TermsTable.Fields.DOCFREQ_REL, relDocFreq);
                                tfc.setValue(TermsTable.Fields.DOCFREQ_ABS, docFreq);
                                tfc.setValue(TermsTable.Fields.LANG, langName);
                                tfc.setValue(TermsTable.Fields.FIELD, fieldName);
                                if (this.cliParams.ipcRec != null) {
                                    tfc.setValue(TermsTable.FieldsOptional.IPC,
                                            this.cliParams.ipcRec.toFormattedString());
                                }
                                dataWriter.addContent(tfc, false);
                                count.incrementAndGet();
                            }
                        }
                    }
                    term = termsEnum.next();
                }
                obs.stop();
                LOG.info("Total of {} terms collected.", NumberFormat.getIntegerInstance().format(count));
            }
        }
    }
}