Example usage for org.apache.lucene.index Fields terms

List of usage examples for org.apache.lucene.index Fields terms

Introduction

On this page you can find example usages of org.apache.lucene.index Fields.terms.

Prototype

public abstract Terms terms(String field) throws IOException;

Document

Get the Terms for this field.
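
The common pattern in the examples below is to obtain a Fields instance from an IndexReader, look up the Terms for a single field, and walk the resulting TermsEnum. Here is a minimal sketch of that pattern using the Lucene 4.x API (Terms.iterator(null)); the index path "data/indexing" and the field name "content" are only placeholders:

IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("data/indexing")));
Fields fields = MultiFields.getFields(reader);   // may be null if the index has no postings
if (fields != null) {
    // terms(field) returns null if the field does not exist in the index
    Terms terms = fields.terms("content");
    if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            System.out.println(term.utf8ToString() + " " + termsEnum.docFreq());
        }
    }
}
reader.close();

Both getFields(...) and terms(...) may return null, so the null checks shown here (and in the examples below) are required.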

Usage

From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramCPMetaCollectorTest.java

License:Apache License

@Test
public void combinedNgramPairMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramCPMetaCollector.class, LuceneNGramCPFE.PARAM_LUCENE_DIR, tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);

                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " +
                    // termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());

                    // if there were multiple instances of the same ngram,
                    // then this would be relevant
                    if (text.utf8ToString().equals("mice_ANDcats_.")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(1, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(65, i);
}

From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramPMetaCollectorTest.java

License:Apache License

@Test
public void lucenePairNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory
            .createEngineDescription(LuceneNGramPMetaCollector.class, LuceneNGramPFE.PARAM_LUCENE_DIR, tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);

                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " +
                    // termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());

                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }

                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(16, i);
}

From source file:de.unihildesheim.iw.lucene.index.FDRIndexDataProvider.java

License:Open Source License

@SuppressFBWarnings({ "EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS", "EXS_EXCEPTION_SOFTENING_NO_CHECKED" })
@Override
public Stream<BytesRef> getDocumentTerms(final int docId, @NotNull final String... field) {
    Arrays.sort(field);
    final Fields fields;
    try {
        fields = this.index.reader.getTermVectors(docId);
    } catch (final IOException e) {
        throw new UncheckedIOException(e);
    }

    if (fields == null) {
        return Stream.empty();
    }

    final BytesRefHash terms = new BytesRefHash();
    StreamSupport.stream(fields.spliterator(), false)
            // filter for required fields
            .filter(fn -> Arrays.binarySearch(field, fn) >= 0).map(fn -> {
                try {
                    return fields.terms(fn);
                } catch (final IOException e) {
                    throw new UncheckedIOException(e);
                }
            }).filter(t -> t != null).forEach(t -> {
                try {
                    final TermsEnum te = t.iterator(null);
                    BytesRef term;
                    while ((term = te.next()) != null) {
                        terms.add(term);
                    }
                } catch (final IOException e) {
                    throw new UncheckedIOException(e);
                }
            });

    return StreamUtils.stream(terms);
}

From source file:de.unihildesheim.iw.lucene.index.FilteredDirectoryReaderTest.java

License:Open Source License

@Test
public void testTermsEnum_totalTermFreq() throws Exception {
    try (TestMemIndex idx = new TestMemIndex(Index.ALL_FIELDS)) {
        final DirectoryReader reader = DirectoryReader.open(idx.dir);
        final FilteredDirectoryReader fReader = new Builder(reader).fields(Collections.singleton("f2")).build();

        fReader.getSubReaders().forEach(r -> {
            final Fields f = r.fields();
            f.forEach(fld -> {
                try {
                    final Terms t = f.terms(fld);
                    final TermsEnum te = t.iterator(null);
                    while (te.next() != null) {
                        te.totalTermFreq();
                    }
                } catch (final IOException e) {
                    throw new UncheckedIOException(e);
                }
            });
        });
    }
}

From source file:de.unihildesheim.iw.lucene.query.RelaxableCommonTermsQuery.java

License:Open Source License

/**
 * New instance using settings from the supplied {@link Builder} instance.
 *
 * @param builder {@link Builder} Instance builder
 * @throws IOException Thrown on low-level i/o-errors
 */
@SuppressWarnings({ "ObjectAllocationInLoop", "ObjectEquality" })
RelaxableCommonTermsQuery(@NotNull final Builder builder) throws IOException {
    // get all query terms
    assert builder.queryStr != null;
    assert builder.analyzer != null;
    this.queryTerms = QueryUtils.tokenizeQueryString(builder.queryStr, builder.analyzer);

    // list of unique terms contained in the query (stopped, analyzed)
    final String[] uniqueQueryTerms = this.queryTerms.stream().distinct().toArray(String[]::new);
    final int uniqueTermsCount = uniqueQueryTerms.length;

    // heavily based on code from org.apache.lucene.queries.CommonTermsQuery
    assert builder.reader != null;
    final List<LeafReaderContext> leaves = builder.reader.leaves();
    final int maxDoc = builder.reader.maxDoc();
    TermsEnum termsEnum = null;
    final List<Query> subQueries = new ArrayList<>(10);

    assert builder.fields != null;
    for (final String field : builder.fields) {
        final TermContext[] tcArray = new TermContext[uniqueTermsCount];
        final BooleanQuery lowFreq = new BooleanQuery();
        final BooleanQuery highFreq = new BooleanQuery();

        // collect term statistics
        for (int i = 0; i < uniqueTermsCount; i++) {
            final Term term = new Term(field, uniqueQueryTerms[i]);
            for (final LeafReaderContext context : leaves) {
                final TermContext termContext = tcArray[i];
                final Fields fields = context.reader().fields();
                final Terms terms = fields.terms(field);
                if (terms != null) {
                    // only, if field exists
                    termsEnum = terms.iterator(termsEnum);
                    if (termsEnum != TermsEnum.EMPTY) {
                        if (termsEnum.seekExact(term.bytes())) {
                            if (termContext == null) {
                                tcArray[i] = new TermContext(builder.reader.getContext(), termsEnum.termState(),
                                        context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                            } else {
                                termContext.register(termsEnum.termState(), context.ord, termsEnum.docFreq(),
                                        termsEnum.totalTermFreq());
                            }
                        }
                    }
                }
            }

            // build query
            if (tcArray[i] == null) {
                lowFreq.add(new TermQuery(term), builder.lowFreqOccur);
            } else {
                if ((builder.maxTermFrequency >= 1f && (float) tcArray[i].docFreq() > builder.maxTermFrequency)
                        || (tcArray[i].docFreq() > (int) Math
                                .ceil((double) (builder.maxTermFrequency * (float) maxDoc)))) {
                    highFreq.add(new TermQuery(term, tcArray[i]), builder.highFreqOccur);
                } else {
                    lowFreq.add(new TermQuery(term, tcArray[i]), builder.lowFreqOccur);
                }
            }

            final int numLowFreqClauses = lowFreq.clauses().size();
            final int numHighFreqClauses = highFreq.clauses().size();
            if (builder.lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) {
                lowFreq.setMinimumNumberShouldMatch(numLowFreqClauses);
            }
            if (builder.highFreqOccur == Occur.SHOULD && numHighFreqClauses > 0) {
                highFreq.setMinimumNumberShouldMatch(numHighFreqClauses);
            }
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("qLF={}", lowFreq);
            LOG.debug("qHF={}", highFreq);
        }

        if (lowFreq.clauses().isEmpty()) {
            subQueries.add(highFreq);
        } else if (highFreq.clauses().isEmpty()) {
            subQueries.add(lowFreq);
        } else {
            final BooleanQuery query = new BooleanQuery(true); // final query
            query.add(highFreq, Occur.SHOULD);
            query.add(lowFreq, Occur.MUST);
            subQueries.add(query);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("qList={}", subQueries);
    }

    this.query = subQueries.size() == 1 ? subQueries.get(0) : new DisjunctionMaxQuery(subQueries, 0.1f);

    if (LOG.isDebugEnabled()) {
        LOG.debug("RCTQ {} uQt={}", this.query, uniqueQueryTerms);
    }
}

From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.CorpusHighFreqTerms.java

License:Open Source License

/**
 *
 * @param reader
 * @param numTerms
 * @param field
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field) throws Exception {
    TermStatsQueue tiq = null;

    if (field != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            throw new RuntimeException("field " + field + " not found");
        }
        Terms terms = fields.terms(field);
        if (terms != null) {
            TermsEnum termsEnum = terms.iterator(null);
            tiq = new TermStatsQueue(numTerms);
            tiq.fill(field, termsEnum);
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            throw new RuntimeException("no fields found for this index");
        }
        tiq = new TermStatsQueue(numTerms);
        FieldsEnum fieldsEnum = fields.iterator();
        while (true) {
            field = fieldsEnum.next();
            if (field != null) {
                Terms terms = fieldsEnum.terms();
                if (terms != null) {
                    tiq.fill(field, terms.iterator(null));
                }
            } else {
                break;
            }
        }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}

From source file:Example.lucene.ReadIndex.java

public static void main(String[] args) throws IOException, ParseException {

    Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45);

    Directory index = FSDirectory.open(new File("data/indexing"));

    // 2. query
    //String querystr = args.length > 0 ? args[0] : "golf user";
    // the "title" arg specifies the default field to use
    // when no field is explicitly specified in the query.
    //Query q = new MultiFieldQueryParser(Version.LUCENE_45, new String[] {"content"}, analyzer).parse(querystr);
    //IndexReader indexReader = IndexReader.open(path);
    IndexReader reader = DirectoryReader.open(index);
    //IndexSearcher searcher = new IndexSearcher(reader);

    //Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("content");
    //TermsEnum te = terms.iterator(TermsEnum.EMPTY);
    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms("content");
    TermsEnum iterator = terms.iterator(null);
    BytesRef byteRef;
    while ((byteRef = iterator.next()) != null) {
        String term = new String(byteRef.bytes, byteRef.offset, byteRef.length);
        int docFreq = iterator.docFreq();
        System.out.println(term + " " + docFreq);
    }
}

From source file:in.geocoder.component.GeocodingComponent.java

License:Apache License

private NamedList<Integer> getTerms(SolrIndexSearcher searcher, IndexSchema schema, String field)
        throws IOException {
    NamedList<Object> termsResult = new SimpleOrderedMap<Object>();

    boolean sort = true;

    boolean raw = false;

    final AtomicReader indexReader = searcher.getAtomicReader();
    Fields lfields = indexReader.fields();

    NamedList<Integer> fieldTerms = new NamedList<Integer>();
    termsResult.add(field, fieldTerms);

    Terms terms = lfields == null ? null : lfields.terms(field);
    if (terms == null) {
        // no terms for this field
        return new NamedList<Integer>();
    }

    FieldType ft = raw ? null : schema.getFieldTypeNoEx(field);
    if (ft == null)
        ft = new StrField();

    TermsEnum termsEnum = terms.iterator(null);
    BytesRef term = null;

    term = termsEnum.next();

    BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort
            ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(Integer.MAX_VALUE)
            : null);
    CharsRef external = new CharsRef();
    while (term != null) {
        boolean externalized = false; // did we fill in "external" yet for this term?

        // This is a good term in the range.  Check if mincount/maxcount conditions are satisfied.
        int docFreq = termsEnum.docFreq();
        // add the term to the list
        if (sort) {
            queue.add(new CountPair<BytesRef, Integer>(BytesRef.deepCopyOf(term), docFreq));
        } else {
            // TODO: handle raw somehow
            if (!externalized) {
                ft.indexedToReadable(term, external);
            }
            fieldTerms.add(external.toString(), docFreq);
        }

        term = termsEnum.next();
    }

    if (sort) {
        for (CountPair<BytesRef, Integer> item : queue) {
            ft.indexedToReadable(item.key, external);
            fieldTerms.add(external.toString(), item.val);
        }
    }

    return fieldTerms;
}

From source file:indexer.DocVecSequenceFileGenerator.java

long getNumDocs() throws Exception {
    Fields fields = MultiFields.getFields(reader);
    return fields.terms(AMI_FIELDS.FIELD_DOC_NAME).size();
}

From source file:indexer.OptimizedRealValuedVecIndexer.java

void processAllTermWise() throws Exception {
    Cell cell, requantizedCell;

    copyIndex();
    IndexReader currentReader = DirectoryReader.open(writer, true);

    List<Cell> splitCells = new ArrayList<>();

    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms(DocVector.FIELD_CELL_ID);
    TermsEnum te = terms.iterator();

    // Iterate through every term (a cell docName) and requantize the
    // points within the cell if required.
    while (te.next() != null) {
        String cellId = te.term().utf8ToString();
        cell = new Cell(cellId);

        if (cell.toSplit(reader)) {
            splitCells.add(cell);
            List<DocVector> containedPoints = cell.getVectors(currentReader, terms, numDimensions);

            for (DocVector p : containedPoints) {
                requantizedCell = cell.quantize(p); // this function returns a new object
                p.quantize(requantizedCell); // update quantization info (cell docName)
                Document doc = p.constructDoc();

                Term t = new Term(DocVector.FIELD_ID, p.docName);
                writer.deleteDocuments(t);
                writer.addDocument(doc);
            }
            //Much faster if we don't commit here...
            //writer.commit();
        }
    }

    saveSplitCells(writer, splitCells);

    currentReader.close();
    reader.close();
    writer.close();
}