Example usage for org.apache.lucene.index TermsEnum EMPTY

List of usage examples for org.apache.lucene.index TermsEnum EMPTY

Introduction

In this page you can find the example usage for org.apache.lucene.index TermsEnum EMPTY.

Prototype

TermsEnum EMPTY

To view the source code for org.apache.lucene.index TermsEnum EMPTY, click the Source Link below.

Click Source Link

Document

An empty TermsEnum for quickly returning an empty instance, e.g. when a field or term does not exist.

Usage

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.NewsItemToTermsBolt.java

License:Apache License

/**
 * Adds tf-idf-weighted scores for every term in the given document's term
 * vector to {@code termMap}, accumulating onto any existing score.
 *
 * @param reader   index reader used for term statistics and term vectors
 * @param searcher searcher used to locate the document via its "id" field
 * @param termMap  accumulator mapping term text to its weighted tf-idf score
 * @param id       value of the "id" field identifying the document
 * @param field    field whose term vector is scored
 * @param weight   multiplier applied to each term's tf-idf contribution
 * @throws IOException on low-level index access errors
 */
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap,
        String id, String field, double weight) throws IOException {
    Query query = new TermQuery(new Term("id", id));
    TopDocs topdocs = searcher.search(query, 1);

    if (topdocs.totalHits > 0) {
        int docNr = topdocs.scoreDocs[0].doc;
        Terms vector = reader.getTermVector(docNr, field);
        if (vector != null) {
            TermsEnum termsEnum = vector.iterator(TermsEnum.EMPTY);
            // Ignore really rare terms and really common terms. The bounds are
            // loop-invariant, so compute them once instead of per term.
            // FIX: "/ 3" was a truncating integer division assigned to a double;
            // use floating-point division as clearly intended.
            double minFreq = reader.numDocs() * 0.0001;
            double maxFreq = reader.numDocs() / 3.0;
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                int docFreq = reader.docFreq(new Term(field, text));
                if (docFreq > minFreq && docFreq < maxFreq) {
                    // tf is normalized by the field's total term count; idf uses
                    // the standard log(N/df) form.
                    double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field);
                    double idf = Math.log((double) reader.numDocs() / docFreq);
                    if (!Double.isInfinite(idf)) {
                        if (!termMap.containsKey(term)) {
                            termMap.put(term, tf * idf * weight);
                        } else {
                            termMap.put(term, termMap.get(term) + tf * idf * weight);
                        }
                    }
                }
            }
        } else {
            logger.debug("no term available for doc=" + docNr + " and field=" + field);
        }
    } else {
        logger.warn("No documents found with id=" + id);
    }
}

From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.storm.bolts.NewsItemToTermsBolt.java

License:Apache License

/**
 * Adds tf-idf-weighted scores for every term in the given document's term
 * vector to {@code termMap}, accumulating onto any existing score. The
 * document is located via a numeric range query on its long "id" field.
 *
 * @param reader   index reader used for term statistics and term vectors
 * @param searcher searcher used to locate the document by id
 * @param termMap  accumulator mapping term text to its weighted tf-idf score
 * @param id       numeric id identifying the document
 * @param field    field whose term vector is scored
 * @param weight   multiplier applied to each term's tf-idf contribution
 * @throws IOException on low-level index access errors
 */
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap, long id,
        String field, double weight) throws IOException {
    Query query = NumericRangeQuery.newLongRange("id", id, id, true, true);
    TopDocs topdocs = searcher.search(query, 1);

    if (topdocs.totalHits > 0) {
        int docNr = topdocs.scoreDocs[0].doc;
        Terms vector = reader.getTermVector(docNr, field);
        if (vector != null) {
            TermsEnum termsEnum = vector.iterator(TermsEnum.EMPTY);
            // Frequency filtering is effectively disabled in this variant: any
            // term with docFreq >= 1 passes. The bounds are loop-invariant, so
            // they are hoisted out of the term loop.
            double minFreq = 0;
            double maxFreq = Double.MAX_VALUE;
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                int docFreq = reader.docFreq(new Term(field, text));
                if (docFreq > minFreq && docFreq < maxFreq) {
                    // tf is normalized by the field's total term count; idf uses
                    // the standard log(N/df) form.
                    double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field);
                    double idf = Math.log((double) reader.numDocs() / docFreq);
                    if (!Double.isInfinite(idf)) {
                        if (!termMap.containsKey(term)) {
                            termMap.put(term, tf * idf * weight);
                        } else {
                            termMap.put(term, termMap.get(term) + tf * idf * weight);
                        }
                    }
                }
            }
        } else {
            logger.debug("no term available for doc=" + docNr + " and field=" + field);
        }
    } else {
        logger.warn("No documents found with id=" + id);
    }
}

From source file:cc.twittertools.index.ExtractTermStatisticsFromIndex.java

License:Apache License

/**
 * Dumps per-term document frequency (df) and collection frequency (cf) for
 * the tweet text field to stdout, one "term&lt;TAB&gt;df&lt;TAB&gt;cf" line per term.
 * Terms whose df falls below the optional minimum are skipped; their cf is
 * tallied and reported on stderr at the end.
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("min").create(MIN_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options);
        System.exit(-1);
    }

    String indexLocation = cmdline.getOptionValue(INDEX_OPTION);
    int min = cmdline.hasOption(MIN_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1;

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name);
    // FIX: guard against a missing field instead of failing with an NPE below.
    if (terms == null) {
        System.err.println("Error: no terms found for field: " + StatusField.TEXT.name);
        reader.close();
        out.close();
        System.exit(-1);
    }
    TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);

    long missingCnt = 0;
    int skippedTerms = 0;
    // FIX: removed a useless BytesRef allocation that was immediately
    // overwritten by the first next() call.
    BytesRef bytes;
    while ((bytes = termsEnum.next()) != null) {
        byte[] buf = new byte[bytes.length];
        // FIX: honor bytes.offset — a BytesRef's payload does not necessarily
        // start at index 0 of its backing array.
        System.arraycopy(bytes.bytes, bytes.offset, buf, 0, bytes.length);
        String term = new String(buf, "UTF-8");
        int df = termsEnum.docFreq();
        long cf = termsEnum.totalTermFreq();

        if (df < min) {
            // Below-threshold terms are not printed, but their cf is tallied.
            skippedTerms++;
            missingCnt += cf;
            continue;
        }

        out.println(term + "\t" + df + "\t" + cf);
    }

    reader.close();
    out.close();
    System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt);
}

From source file:com.joliciel.jochre.search.highlight.LuceneQueryHighlighter.java

License:Open Source License

/**
 * Finds, for each requested document, all occurrences of the query terms in
 * the requested fields, and returns them as {@link HighlightTerm}s (with
 * character offsets and image/page coordinates) keyed by document id.
 * <p>
 * Works segment-by-segment: the doc ids are grouped by index leaf, then for
 * each leaf and field the postings of every query term are scanned and
 * matches falling on the requested documents are recorded.
 *
 * @param docIds the (global) Lucene document ids to highlight
 * @param fields the index fields to search for term occurrences
 * @return map from document id to its set of highlight terms (a TreeSet,
 *         so terms are ordered); every requested id has an entry
 * @throws RuntimeException wrapping any IOException from index access
 */
public Map<Integer, Set<HighlightTerm>> highlight(Set<Integer> docIds, Set<String> fields) {
    try {
        Map<Integer, Set<HighlightTerm>> termMap = new HashMap<Integer, Set<HighlightTerm>>();
        Map<Integer, Document> idToDocMap = new HashMap<Integer, Document>();
        Map<Integer, CoordinateStorage> idToCoordinateStorageMap = new HashMap<Integer, CoordinateStorage>();

        // Group the requested doc ids by the index leaf (segment) that holds
        // them, caching per-document lookups (stored doc, coordinate storage)
        // along the way.
        Map<Integer, Set<Integer>> myLeaves = new HashMap<Integer, Set<Integer>>();
        for (int docId : docIds) {
            Document luceneDoc = indexSearcher.doc(docId);
            idToDocMap.put(docId, luceneDoc);
            JochreIndexDocument jochreDoc = searchService.getJochreIndexDocument(indexSearcher, docId);
            idToCoordinateStorageMap.put(docId, jochreDoc.getCoordinateStorage());
            termMap.put(docId, new TreeSet<HighlightTerm>());
            int leaf = ReaderUtil.subIndex(docId, leaves);
            Set<Integer> docsPerLeaf = myLeaves.get(leaf);
            if (docsPerLeaf == null) {
                docsPerLeaf = new HashSet<Integer>();
                myLeaves.put(leaf, docsPerLeaf);
            }
            docsPerLeaf.add(docId);
        }

        for (int leaf : myLeaves.keySet()) {
            if (LOG.isTraceEnabled())
                LOG.trace("Searching leaf " + leaf);
            Set<Integer> docsPerLeaf = myLeaves.get(leaf);
            AtomicReaderContext subContext = leaves.get(leaf);
            AtomicReader atomicReader = subContext.reader();

            int fieldCounter = 0;
            for (String field : fields) {
                fieldCounter++;
                if (LOG.isTraceEnabled())
                    LOG.trace("Field " + fieldCounter + ": " + field);

                Terms atomicReaderTerms = atomicReader.terms(field);
                if (atomicReaderTerms == null) {
                    continue; // nothing to do
                }
                TermsEnum termsEnum = atomicReaderTerms.iterator(TermsEnum.EMPTY);

                // NOTE(review): `terms` is an instance field — presumably the
                // query's terms collected elsewhere; confirm in the class.
                int termCounter = 0;
                for (BytesRef term : terms) {
                    termCounter++;
                    if (LOG.isTraceEnabled())
                        LOG.trace("Searching for term " + termCounter + ": " + term.utf8ToString()
                                + " in field " + field);

                    if (!termsEnum.seekExact(term)) {
                        continue; // term not found
                    }

                    // NOTE(review): docsAndPositions() can return null if the
                    // field was indexed without positions — assumed indexed
                    // with offsets here; confirm against the indexing config.
                    DocsAndPositionsEnum docPosEnum = termsEnum.docsAndPositions(null, null,
                            DocsAndPositionsEnum.FLAG_OFFSETS);
                    int relativeDocId = docPosEnum.nextDoc();
                    while (relativeDocId != DocsAndPositionsEnum.NO_MORE_DOCS) {
                        // Convert the segment-relative doc id back to a global one.
                        int docId = subContext.docBase + relativeDocId;
                        if (docsPerLeaf.contains(docId)) {
                            Document doc = idToDocMap.get(docId);
                            Set<HighlightTerm> highlightTerms = termMap.get(docId);
                            //Retrieve the term frequency in the current document
                            int freq = docPosEnum.freq();
                            if (LOG.isTraceEnabled()) {
                                String extId = doc.get("id");
                                String path = doc.get("path");
                                LOG.trace("Found " + freq + " matches for doc " + docId + ", extId: " + extId
                                        + ", path: " + path);
                            }

                            // One highlight term per occurrence of the term in
                            // this document.
                            for (int i = 0; i < freq; i++) {
                                int position = docPosEnum.nextPosition();
                                int start = docPosEnum.startOffset();
                                int end = docPosEnum.endOffset();

                                if (LOG.isTraceEnabled())
                                    LOG.trace("Found match " + position + " at docId " + docId + ", field "
                                            + field + " start=" + start + ", end=" + end);

                                // Map the character offset to the image/page it
                                // falls on.
                                CoordinateStorage coordinateStorage = idToCoordinateStorageMap.get(docId);
                                int imageIndex = coordinateStorage.getImageIndex(start);
                                int pageIndex = coordinateStorage.getPageIndex(start);

                                HighlightTerm highlightTerm = new HighlightTerm(docId, field, start, end,
                                        imageIndex, pageIndex);
                                highlightTerm.setWeight(this.weigh(term));
                                // Zero-weight terms are dropped from the result.
                                if (highlightTerm.getWeight() > 0)
                                    highlightTerms.add(highlightTerm);
                            }
                        }
                        relativeDocId = docPosEnum.nextDoc();
                    }
                } // next term
            } // next field
        } // next index leaf to search

        return termMap;
    } catch (IOException e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}

From source file:com.sindicetech.siren.search.node.NodeNumericRangeQuery.java

License:Open Source License

@Override
@SuppressWarnings("unchecked")
protected TermsEnum getTermsEnum(final Terms terms, final AttributeSource atts) throws IOException {
    // An empty numeric range (min > max) can never match, so short-circuit with
    // the shared empty enum. Note: java.lang.Number itself is not Comparable,
    // but every subclass used here is — hence the unchecked cast.
    if (min != null && max != null && ((Comparable<T>) min).compareTo(max) > 0) {
        return TermsEnum.EMPTY;
    }
    return new NumericRangeTermsEnum(terms.iterator(null));
}

From source file:com.sindicetech.siren.search.node.NodeTermCollectingRewrite.java

License:Open Source License

/**
 * Iterates every segment of the index, feeds each segment's matching terms
 * for the query into the collector, and stops early as soon as the collector
 * signals it has seen enough.
 */
final void collectTerms(final IndexReader reader, final MultiNodeTermQuery query, final TermCollector collector)
        throws IOException {
    final IndexReaderContext topReaderContext = reader.getContext();
    Comparator<BytesRef> previousComparator = null;
    for (final AtomicReaderContext context : topReaderContext.leaves()) {
        // Skip segments that expose no fields at all.
        final Fields fields = context.reader().fields();
        if (fields == null) {
            continue;
        }

        // Skip segments where the query's field is absent.
        final Terms terms = fields.terms(query.field);
        if (terms == null) {
            continue;
        }

        final TermsEnum termsEnum = this.getTermsEnum(query, terms, collector.attributes);
        assert termsEnum != null;

        // Identity comparison is intentional: EMPTY is a shared sentinel.
        if (termsEnum == TermsEnum.EMPTY) {
            continue;
        }

        // All segments must order their terms identically.
        final Comparator<BytesRef> currentComparator = termsEnum.getComparator();
        if (previousComparator != null && currentComparator != null
                && currentComparator != previousComparator) {
            throw new RuntimeException("term comparator should not change between segments: "
                    + previousComparator + " != " + currentComparator);
        }
        previousComparator = currentComparator;

        collector.setReaderContext(topReaderContext, context);
        collector.setNextEnum(termsEnum);
        for (BytesRef termBytes = termsEnum.next(); termBytes != null; termBytes = termsEnum.next()) {
            // A false return interrupts the whole collection, so we also do not
            // iterate the remaining sub-readers.
            if (!collector.collect(termBytes)) {
                return;
            }
        }
    }
}

From source file:com.sindicetech.siren.search.node.NodeTermRangeQuery.java

License:Open Source License

@Override
protected TermsEnum getTermsEnum(final Terms terms, final AttributeSource atts) throws IOException {
    // Inverted bounds (lower > upper) can never match any term.
    if (lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) {
        return TermsEnum.EMPTY;
    }

    final TermsEnum rawEnum = terms.iterator(null);

    // With no effective lower bound and no upper bound the raw enum can be
    // returned without a range wrapper.
    final boolean unboundedBelow = lowerTerm == null || (includeLower && lowerTerm.length == 0);
    if (unboundedBelow && upperTerm == null) {
        return rawEnum;
    }
    return new TermRangeTermsEnum(rawEnum, lowerTerm, upperTerm, includeLower, includeUpper);
}

From source file:de.unihildesheim.iw.cli.DumpCommonTerms.java

License:Open Source License

/**
 * Dumps every term of the configured field whose relative document frequency
 * exceeds the configured threshold to a CSV file ("term,relDF"), skipping
 * stopwords.
 *
 * @param args Commandline arguments.
 * @throws IOException Thrown on low-level i/o-errors
 */
@SuppressWarnings("ObjectAllocationInLoop")
@SuppressFBWarnings("SEC_SIDE_EFFECT_CONSTRUCTOR")
private void runMain(final String... args) throws IOException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // Sanity-check configured files and directories.
    this.cliParams.check();

    LOG.info("Writing terms to '{}'.", this.cliParams.targetFile);

    assert this.cliParams.lang != null;
    assert this.cliParams.stopFilePattern != null;
    final Set<String> stopwords = CliCommon.getStopwords(this.cliParams.lang, this.cliParams.stopFileFormat,
            this.cliParams.stopFilePattern);

    assert this.cliParams.idxReader != null;
    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Terms terms = MultiFields.getTerms(this.cliParams.idxReader, this.cliParams.field);
    if (terms == null) {
        // Field absent from the index: nothing to dump.
        return;
    }

    final TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);

    assert this.cliParams.targetFile != null;
    try (CSVWriter csvWriter = new CSVWriter(new OutputStreamWriter(
            new FileOutputStream(this.cliParams.targetFile), StandardCharsets.UTF_8))) {

        // Header line first.
        csvWriter.writeNext(new String[] { "term", "relDF" });

        for (BytesRef termRef = termsEnum.next(); termRef != null; termRef = termsEnum.next()) {
            final String termStr = termRef.utf8ToString();
            if (stopwords.contains(termStr.toLowerCase())) {
                continue;
            }
            final double docFreq = (double) termsEnum.docFreq();
            if (docFreq <= 0d) {
                continue;
            }
            final double relDocFreq = docFreq / (double) maxDoc;
            if (relDocFreq > this.cliParams.threshold) {
                csvWriter.writeNext(new String[] { termStr,
                        // make exponential string R compatible
                        Double.toString(relDocFreq).toLowerCase() });
            }
        }
    }
}

From source file:de.unihildesheim.iw.cli.DumpIPCs.java

License:Open Source License

/**
 * Dumps all IPC codes stored in the index to stdout, optionally restricted to
 * codes matching a given IPC filter, and logs match/skip counts.
 *
 * @param args Commandline arguments.
 * @throws IOException Thrown on low-level i/o-errors
 * @throws BuildException Thrown, if building the filtered index reader failed
 */
private void runMain(final String... args) throws IOException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    assert this.cliParams.idxReader != null;
    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Parser ipcParser = new Parser();
    ipcParser.separatorChar(this.cliParams.sep);
    ipcParser.allowZeroPad(this.cliParams.zeroPad);

    final DirectoryReader reader = DirectoryReader.open(FSDirectory.open(this.cliParams.idxDir.toPath()));
    // FIX: the reader was previously never closed (resource leak); release it
    // even if parsing or filtering throws.
    try {
        final Builder idxReaderBuilder = new Builder(reader);

        Pattern rx_ipc = null;

        if (this.cliParams.ipc != null) {
            // Build both a regular expression (for per-code matching below) and
            // a query filter (to restrict the reader) from the requested IPC.
            final IPCRecord ipc = ipcParser.parse(this.cliParams.ipc);
            final BooleanQuery bq = new BooleanQuery();
            rx_ipc = Pattern.compile(ipc.toRegExpString(this.cliParams.sep));
            if (LOG.isDebugEnabled()) {
                LOG.debug("IPC regExp: rx={} pat={}", ipc.toRegExpString(this.cliParams.sep), rx_ipc);
            }

            bq.add(new QueryWrapperFilter(IPCClassQuery.get(ipc, this.cliParams.sep)), Occur.MUST);
            bq.add(new QueryWrapperFilter(
                    new IPCFieldFilter(new IPCFieldFilterFunctions.SloppyMatch(ipc), ipcParser)), Occur.MUST);
            idxReaderBuilder.queryFilter(new QueryWrapperFilter(bq));
        }

        final IndexReader idxReader = idxReaderBuilder.build();

        if (idxReader.numDocs() > 0) {
            final Terms terms = MultiFields.getTerms(idxReader, LUCENE_CONF.FLD_IPC);
            TermsEnum termsEnum = TermsEnum.EMPTY;
            BytesRef term;
            if (terms != null) {
                termsEnum = terms.iterator(termsEnum);
                term = termsEnum.next();

                final int[] count = { 0, 0 }; // match, exclude
                while (term != null) {
                    final String code = term.utf8ToString();
                    if (rx_ipc == null || (rx_ipc.matcher(code).matches())) {
                        final IPCRecord record = ipcParser.parse(code);
                        try {
                            System.out.println(code + ' ' + record + " (" + record.toFormattedString() + ") " + '['
                                    + record.toRegExpString('-') + ']');
                        } catch (final IllegalArgumentException e) {
                            // Record could not be formatted; still report the raw code.
                            System.out.println(code + ' ' + "INVALID (" + code + ')');
                        }
                        count[0]++;
                    } else {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Skip non matching IPC: {}", code);
                        }
                        count[1]++;
                    }
                    term = termsEnum.next();
                }
                LOG.info("match={} skip={}", count[0], count[1]);
            }
        } else {
            LOG.info("No documents left after filtering.");
        }
    } finally {
        reader.close();
    }
}

From source file:de.unihildesheim.iw.cli.DumpTermData.java

License:Open Source License

/**
 * Dumps per-term statistics (term, relative and absolute document frequency,
 * language, field, optional IPC) for every non-stopword term of the
 * configured field into a term-data database, keeping only terms whose
 * relative document frequency exceeds the configured threshold.
 *
 * @param args Commandline arguments.
 * @throws IOException Thrown on low-level i/o-errors
 * @throws ClassNotFoundException Thrown if JDBC driver could not be loaded
 * @throws SQLException Thrown, if connection to the database has failed
 * @throws BuildException Thrown, if building a {@link
 * FilteredDirectoryReader} instance has failed
 */
@SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess")
private void runMain(final String... args)
        throws IOException, SQLException, ClassNotFoundException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    LOG.info("Writing term-data to '{}'.", this.cliParams.dbFile);

    // table manager instance: Target database for term data
    try (final TermDataDB db = new TermDataDB(this.cliParams.dbFile)) {
        // create meta & data table
        final Table termsTable;
        if (this.cliParams.ipcRec == null) {
            termsTable = new TermsTable();
        } else {
            termsTable = new TermsTable(
                    // include optional IPC field
                    TermsTable.FieldsOptional.IPC);
        }
        final Table metaTable = new MetaTable();
        db.createTables(termsTable, metaTable);

        try (final TermsTable.Writer dataWriter = new TermsTable.Writer(db.getConnection())) {

            // write meta-data: record table name and the exact command line used
            try (final MetaTable.Writer metaWriter = new MetaTable.Writer(db.getConnection())) {
                metaWriter.addContent(new TableFieldContent(metaTable)
                        .setValue(MetaTable.Fields.TABLE_NAME, termsTable.getName())
                        .setValue(MetaTable.Fields.CMD, StringUtils.join(args, " ")));
            }

            // Stopwords are optional; without a pattern nothing is filtered.
            final Set<String> sWords;
            if (this.cliParams.stopFilePattern != null) {
                sWords = CliCommon.getStopwords(this.cliParams.lang, this.cliParams.stopFileFormat,
                        this.cliParams.stopFilePattern);
            } else {
                sWords = Collections.emptySet();
            }

            final int maxDoc = this.cliParams.idxReader.maxDoc();
            if (maxDoc == 0) {
                LOG.error("Empty index.");
                return;
            }

            final Terms terms = MultiFields.getTerms(this.cliParams.idxReader, this.cliParams.field);
            // EMPTY serves both as a safe default and as the reuse argument to
            // terms.iterator() below (Lucene 4 enum-reuse idiom).
            TermsEnum termsEnum = TermsEnum.EMPTY;
            BytesRef term;

            if (terms != null) {
                termsEnum = terms.iterator(termsEnum);
                term = termsEnum.next();
                final AtomicLong count = new AtomicLong(0L);

                // Periodically logs progress while the term loop runs.
                @SuppressWarnings("AnonymousInnerClassMayBeStatic")
                final TaskObserver obs = new TaskObserver(new TaskObserverMessage() {
                    @Override
                    public void call(@NotNull final TimeMeasure tm) {
                        LOG.info("Collected {} terms after {}.",
                                NumberFormat.getIntegerInstance().format(count.get()), tm.getTimeString());
                    }
                }).start();

                // normalize some parameters
                final String fieldName = StringUtils.lowerCase(this.cliParams.field);
                final String langName = StringUtils.lowerCase(this.cliParams.lang);

                while (term != null) {
                    final String termStr = term.utf8ToString();
                    // Skip stopwords (case-insensitive match).
                    if (!sWords.contains(termStr.toLowerCase())) {
                        final double docFreq = (double) termsEnum.docFreq();
                        if (docFreq > 0d) {
                            final double relDocFreq = docFreq / (double) maxDoc;

                            // Only terms above the frequency threshold are stored.
                            if (relDocFreq > this.cliParams.threshold) {
                                @SuppressWarnings("ObjectAllocationInLoop")
                                final TableFieldContent tfc = new TableFieldContent(termsTable);
                                tfc.setValue(TermsTable.Fields.TERM, termStr);
                                tfc.setValue(TermsTable.Fields.DOCFREQ_REL, relDocFreq);
                                tfc.setValue(TermsTable.Fields.DOCFREQ_ABS, docFreq);
                                tfc.setValue(TermsTable.Fields.LANG, langName);
                                tfc.setValue(TermsTable.Fields.FIELD, fieldName);
                                if (this.cliParams.ipcRec != null) {
                                    tfc.setValue(TermsTable.FieldsOptional.IPC,
                                            this.cliParams.ipcRec.toFormattedString());
                                }
                                dataWriter.addContent(tfc, false);
                                count.incrementAndGet();
                            }
                        }
                    }
                    term = termsEnum.next();
                }
                obs.stop();
                LOG.info("Total of {} terms collected.", NumberFormat.getIntegerInstance().format(count));
            }
        }
    }
}