Example usage for org.apache.lucene.index Fields terms

List of usage examples for org.apache.lucene.index Fields terms

Introduction

On this page you can find example usage for org.apache.lucene.index Fields.terms.

Prototype

public abstract Terms terms(String field) throws IOException;

Source Link

Document

Get the Terms for this field.

Usage

From source file:org.languagetool.dev.bigdata.LargestNGramFinder.java

License:Open Source License

/**
 * Scans a Lucene ngram index and prints the ngram with the largest stored
 * "count" value. For every term of the "ngram" field, one matching document
 * is fetched and its "count" field parsed; the running maximum is reported
 * every 10,000 terms and once more at the end.
 *
 * @param args exactly one argument: path to the ngram index directory
 * @throws IOException if the index cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        System.out.println("Usage: " + LargestNGramFinder.class.getSimpleName() + " <ngramIndexDir>");
        System.exit(1);
    }
    // try-with-resources: the original leaked both the directory and the reader
    try (FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
            IndexReader reader = DirectoryReader.open(fsDir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Fields fields = MultiFields.getFields(reader);
        Terms terms = fields.terms("ngram");
        if (terms == null) {
            // Fields.terms() returns null for an absent field; fail with a clear
            // message instead of an NPE on terms.iterator().
            throw new IOException("Index at " + args[0] + " has no 'ngram' field");
        }
        TermsEnum termsEnum = terms.iterator();
        long max = 0;
        String maxTerm = "";
        int count = 0;
        BytesRef next;
        while ((next = termsEnum.next()) != null) {
            String term = next.utf8ToString();
            // Any document matching the term carries its occurrence count in
            // the stored "count" field; the first hit is sufficient.
            TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5);
            int docId = topDocs.scoreDocs[0].doc;
            Document document = reader.document(docId);
            long thisCount = Long.parseLong(document.get("count"));
            if (max < thisCount) {
                max = thisCount;
                maxTerm = term;
            }
            // Progress output every 10,000 terms.
            if (count % 10_000 == 0) {
                System.out.println(count + " -> " + topDocs.totalHits + " for " + term + " -> " + thisCount
                        + ", max so far: " + max + " for '" + maxTerm + "'");
            }
            count++;
        }
        System.out.println("Max: " + max + " for " + maxTerm);
    }
}

From source file:org.languagetool.dev.bigdata.NeededNGramCounter.java

License:Open Source License

/**
 * Counts how many ngrams in a Lucene ngram index are needed by the confusion
 * sets of the configured language ({@code LANG}). An ngram is considered
 * "needed" iff at least one of its space-separated tokens occurs as a key in
 * the language's confusion set file.
 *
 * @param args exactly one argument: path to the ngram index directory
 * @throws IOException if the confusion set resource or the index cannot be read
 */
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        System.out.println("Usage: " + NeededNGramCounter.class.getSimpleName() + " <ngramIndexDir>");
        System.exit(1);
    }
    Language lang = Languages.getLanguageForShortCode(LANG);
    String path = "/" + lang.getShortCode() + "/confusion_sets.txt";
    Set<String> ngrams;
    try (InputStream confSetStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(path)) {
        // The confusion-set keys are the tokens we must keep ngrams for.
        ngrams = new ConfusionSetLoader().loadConfusionSet(confSetStream).keySet();
    }
    String ngramIndexDir = args[0];
    // try-with-resources: the original leaked both the directory and the reader
    try (FSDirectory fsDir = FSDirectory.open(new File(ngramIndexDir).toPath());
            IndexReader reader = DirectoryReader.open(fsDir)) {
        Fields fields = MultiFields.getFields(reader);
        Terms terms = fields.terms("ngram");
        if (terms == null) {
            // Fields.terms() returns null for an absent field; fail clearly
            // instead of an NPE on terms.iterator().
            throw new IOException("Index at " + ngramIndexDir + " has no 'ngram' field");
        }
        TermsEnum termsEnum = terms.iterator();
        int i = 0;
        int needed = 0;
        int notNeeded = 0;
        BytesRef next;
        while ((next = termsEnum.next()) != null) {
            String term = next.utf8ToString();
            // Needed iff any token of the ngram is a confusion-set key.
            boolean ngramNeeded = false;
            for (String token : term.split(" ")) {
                if (ngrams.contains(token)) {
                    ngramNeeded = true;
                    break;
                }
            }
            if (ngramNeeded) {
                needed++;
            } else {
                notNeeded++;
            }
            if (i % 500_000 == 0) {
                // NOTE(review): getDocCount() is the number of documents with
                // the field, not the number of terms, so this denominator is
                // only a rough progress indicator — confirm if exactness matters.
                System.out.println(i + "/" + terms.getDocCount());
            }
            i++;
        }
        System.out.println("language         : " + LANG);
        System.out.println("ngram index      : " + ngramIndexDir);
        System.out.println("needed ngrams    : " + needed);
        System.out.println("not needed ngrams: " + notNeeded);
    }
}

From source file:org.languagetool.dev.HomophoneOccurrenceDumper.java

License:Open Source License

/**
 * Returns a terms iterator over the "ngram" field of the 3-gram index.
 *
 * @throws IOException if the underlying index cannot be read
 */
private TermsEnum getIterator() throws IOException {
    LuceneSearcher searcher = getLuceneSearcher(3);
    Fields allFields = MultiFields.getFields(searcher.getReader());
    return allFields.terms("ngram").iterator(null);
}

From source file:org.lexevs.dao.index.lucene.v2010.metadata.LuceneMetadataDao.java

License:Open Source License

@Override
public AbsoluteCodingSchemeVersionReferenceList listCodingSchemes() {
    // Enumerates every distinct "codingSchemeNameVersion" term in the index
    // and, for each term that still matches at least one document, copies that
    // document's registered name and version into the result list.

    AbsoluteCodingSchemeVersionReferenceList result = new AbsoluteCodingSchemeVersionReferenceList();

    try {
        // Obtain a TermsEnum for the codingSchemeNameVersion field inside the
        // reader callback; te stays null when the index has no fields or the
        // field is absent, which the loop below tolerates.
        final TermsEnum te = luceneIndexTemplate.executeInIndexReader(new IndexReaderCallback<TermsEnum>() {

            @Override
            public TermsEnum doInIndexReader(IndexReader indexReader) throws Exception {
                TermsEnum termsEnum = null;
                Fields fields = MultiFields.getFields(indexReader);
                if (fields != null) {
                    Terms terms = fields.terms("codingSchemeNameVersion");
                    if (terms != null) {

                        termsEnum = terms.iterator();
                    }
                }

                return termsEnum;
            }
        });

        // TODO see Multifield for a better implementation of this.
        BytesRef text = null;
        while ((te != null) && (text = te.next()) != null) {
            // Find one document carrying this scheme-name/version term.
            Query temp = new TermQuery(new Term("codingSchemeNameVersion", text.utf8ToString()));

            List<ScoreDoc> d = this.luceneIndexTemplate.search(temp, null);
            if (d.size() > 0) {

                // Only the first hit is used — presumably every document for a
                // given term shares the same scheme metadata; confirm against
                // the index schema.
                ScoreDoc doc = d.get(0);
                AbsoluteCodingSchemeVersionReference acsvr = new AbsoluteCodingSchemeVersionReference();

                Document document = luceneIndexTemplate.getDocumentById(doc.doc);
                acsvr.setCodingSchemeURN(document.get("codingSchemeRegisteredName"));
                acsvr.setCodingSchemeVersion(document.get("codingSchemeVersion"));

                result.addAbsoluteCodingSchemeVersionReference(acsvr);
            }

        }

        return result;
    } catch (Exception e) {
        // Wrap checked exceptions from the template/callback; original cause preserved.
        throw new RuntimeException(e);
    }
}

From source file:org.meresco.lucene.Lucene.java

License:Open Source License

/**
 * Builds a "more like this" query for the document with the given identifier:
 * every term from the document's term vectors is added to a CommonTermsQuery,
 * the source document itself is excluded, and the combined query is executed.
 * Returns an empty response when the identifier is unknown or the document
 * has no term vectors.
 */
public LuceneResponse similarDocuments(String identifier) throws Throwable {
    SearcherAndTaxonomy reference = data.getManager().acquire();
    try {
        Query idQuery = new TermQuery(new Term(ID_FIELD, identifier));
        TopDocs topDocs = reference.searcher.search(idQuery, 1);
        if (topDocs.totalHits == 0)
            return new LuceneResponse(0);  // no document with this identifier
        int docId = topDocs.scoreDocs[0].doc;
        IndexReader reader = reference.searcher.getIndexReader();
        // 0.1f is the CommonTermsQuery frequency cutoff — presumably terms in
        // more than 10% of documents are treated as "common"; confirm against
        // the CommonTermsQuery API docs.
        CommonTermsQuery commonQuery = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, 0.1f);
        Fields termVectors = reader.getTermVectors(docId);
        if (termVectors == null)
            return new LuceneResponse(0);  // document was indexed without term vectors
        // Add every term of every stored field vector to the similarity query.
        for (String field : termVectors) {
            TermsEnum iterator = termVectors.terms(field).iterator(null);
            BytesRef b;
            while ((b = iterator.next()) != null) {
                Term term = new Term(field, b.utf8ToString());
                commonQuery.add(term);
            }
        }
        BooleanQuery query = new BooleanQuery();
        query.add(idQuery, Occur.MUST_NOT);  // never return the source document itself
        query.add(commonQuery, Occur.MUST);
        return executeQuery(query);
    } finally {
        // Always hand the searcher back to the manager, even on failure.
        data.getManager().release(reference);
    }
}

From source file:org.musicbrainz.search.index.ReleaseGroupIndexTest.java

License:Open Source License

@Test
public void testIndexReleaseGroupNumReleases() throws Exception {

    addReleaseGroupOne();
    RAMDirectory ramDir = new RAMDirectory();
    createIndex(ramDir);

    IndexReader ir = DirectoryReader.open(ramDir);
    assertEquals(2, ir.numDocs());
    {
        // NUM_RELEASES is numeric-encoded; decode the first term and verify
        // release group one indexes exactly one release.
        Fields fields = MultiFields.getFields(ir);
        Terms terms = fields.terms(ReleaseGroupIndexField.NUM_RELEASES.getName());
        TermsEnum termsEnum = terms.iterator(null);
        termsEnum.next();
        assertEquals(1, NumericUtils.prefixCodedToInt(termsEnum.term()));
    }
    // Close the reader — the original leaked it, unlike the sibling test
    // in this class which closes its reader.
    ir.close();
}

From source file:org.musicbrainz.search.index.ReleaseGroupIndexTest.java

License:Open Source License

@Test
public void testIndexReleaseGroupMultipleArtists() throws Exception {

    // Index release group four (multiple credited artists) plus the index
    // metadata document, then verify the indexed terms and the serialized
    // artist credit.
    addReleaseGroupFour();
    RAMDirectory ramDir = new RAMDirectory();
    createIndex(ramDir);

    IndexReader ir = DirectoryReader.open(ramDir);
    assertEquals(2, ir.numDocs());
    {
        Document doc = ir.document(1);

        // ARTIST_NAME terms: analyzed tokens from all credited artist names
        // and aliases, enumerated in sorted order.
        Fields fields = MultiFields.getFields(ir);
        Terms terms = fields.terms(ReleaseGroupIndexField.ARTIST_NAME.getName());
        TermsEnum tr = terms.iterator(null);
        tr.next();
        assertEquals(1, tr.docFreq());
        assertEquals("cincinnati", tr.term().utf8ToString());
        tr.next();
        assertEquals("erich", tr.term().utf8ToString());
        tr.next();
        assertEquals("kunzel", tr.term().utf8ToString());
        tr.next();
        assertEquals("kunzstel", tr.term().utf8ToString());
        tr.next();
        assertEquals("kunzstelein", tr.term().utf8ToString());
        tr.next();
        assertEquals("orchestra", tr.term().utf8ToString());
        tr.next();
        assertEquals("pops", tr.term().utf8ToString());
        tr.next();
        assertEquals("the", tr.term().utf8ToString());

        // ARTIST_ID terms: one MBID per credited artist, in sorted order.
        terms = fields.terms(ReleaseGroupIndexField.ARTIST_ID.getName());
        tr = terms.iterator(null);
        tr.next();
        assertEquals(1, tr.docFreq());
        assertEquals("99845d0c-f239-4051-a6b1-4b5e9f7ede0b", tr.term().utf8ToString());
        tr.next();
        assertEquals("d8fbd94c-cd06-4e8b-a559-761ad969d07e", tr.term().utf8ToString());
        // NOTE(review): this trailing next() has no assertion — presumably it
        // just confirms no exception past the last term; consider asserting
        // the enum is exhausted.
        tr.next();

        // ARTIST_NAMECREDIT terms: tokens from the as-credited names only
        // (no aliases), in sorted order.
        terms = fields.terms(ReleaseGroupIndexField.ARTIST_NAMECREDIT.getName());
        tr = terms.iterator(null);
        tr.next();
        assertEquals(1, tr.docFreq());
        assertEquals("cincinnati", tr.term().utf8ToString());
        tr.next();
        assertEquals("erich", tr.term().utf8ToString());
        tr.next();
        assertEquals("kunzel", tr.term().utf8ToString());
        tr.next();
        assertEquals("pops", tr.term().utf8ToString());

        // Stored release-group fields.
        assertEquals("Epics", doc.getFields(ReleaseGroupIndexField.RELEASEGROUP.getName())[0].stringValue());
        assertEquals("efd2ace2-b3b9-305f-8a53-9803595c0e37",
                doc.getFields(ReleaseGroupIndexField.RELEASEGROUP_ID.getName())[0].stringValue());

        // Round-trip the serialized artist credit: two name credits, the first
        // artist carrying two aliases with sort names and locales.
        ArtistCredit ac = ArtistCreditHelper
                .unserialize(doc.get(ReleaseGroupIndexField.ARTIST_CREDIT.getName()));
        assertNotNull(ac);
        assertEquals("Erich Kunzel", ac.getNameCredit().get(0).getArtist().getName());
        assertEquals("Cincinnati Pops", ac.getNameCredit().get(1).getName());
        assertEquals("The Cincinnati Pops Orchestra", ac.getNameCredit().get(1).getArtist().getName());
        assertEquals(2, ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().size());
        assertNull(ac.getNameCredit().get(1).getArtist().getAliasList());
        assertEquals("Erich Kunzstel",
                ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().get(0).getContent());
        assertEquals("Erich Kunzstelein",
                ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().get(1).getContent());
        assertEquals("Kunzstel, Erich",
                ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().get(0).getSortName());
        assertEquals("Kunzstelein, Erich",
                ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().get(1).getSortName());
        assertEquals("en", ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().get(0).getLocale());
        assertEquals("de", ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().get(1).getLocale());

    }
    ir.close();

}

From source file:org.neo4j.kernel.api.impl.schema.verification.PartitionedUniquenessVerifier.java

License:Open Source License

/**
 * Merges the per-leaf {@code Terms} of the given field into a single
 * {@code MultiTerms} view spanning all leaf readers. Leaves without the
 * field are skipped.
 *
 * @param fieldName the indexed field to collect terms for
 * @throws IOException if a leaf reader fails
 */
private Terms termsForField(String fieldName) throws IOException {
    List<Terms> perLeafTerms = new ArrayList<>();
    List<ReaderSlice> slices = new ArrayList<>();

    for (LeafReader reader : allLeafReaders()) {
        Terms fieldTerms = reader.fields().terms(fieldName);
        if (fieldTerms == null) {
            continue;  // this leaf has no terms for the field
        }
        perLeafTerms.add(fieldTerms);
        slices.add(new ReaderSlice(0, Math.toIntExact(fieldTerms.size()), 0));
    }

    return new MultiTerms(perLeafTerms.toArray(new Terms[perLeafTerms.size()]),
            slices.toArray(new ReaderSlice[slices.size()]));
}

From source file:org.neo4j.kernel.api.impl.schema.verification.SimpleUniquenessVerifier.java

License:Open Source License

/**
 * Verifies uniqueness across the whole index: for every indexed field except
 * the node-id field, any term appearing in more than one document is re-searched
 * with a duplicate-checking collector, which raises a conflict on a true
 * duplicate. An IndexEntryConflictException wrapped in an IOException by the
 * collector is unwrapped and rethrown.
 */
@Override
public void verify(PropertyAccessor accessor, int propKeyId) throws IndexEntryConflictException, IOException {
    try {
        DuplicateCheckingCollector duplicates = new DuplicateCheckingCollector(accessor, propKeyId);
        IndexSearcher searcher = indexSearcher();
        for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
            Fields leafFields = leaf.reader().fields();
            for (String fieldName : leafFields) {
                // The node-id field is unique by construction; skip it.
                if (LuceneDocumentStructure.NODE_ID_KEY.equals(fieldName)) {
                    continue;
                }

                TermsEnum fieldTerms = LuceneDocumentStructure.originalTerms(leafFields.terms(fieldName),
                        fieldName);
                for (BytesRef termBytes = fieldTerms.next(); termBytes != null; termBytes = fieldTerms.next()) {
                    // docFreq > 1 is only a candidate — the collector decides
                    // whether it is a genuine conflict.
                    if (fieldTerms.docFreq() > 1) {
                        duplicates.reset();
                        searcher.search(new TermQuery(new Term(fieldName, termBytes)), duplicates);
                    }
                }
            }
        }
    } catch (IOException e) {
        Throwable cause = e.getCause();
        if (cause instanceof IndexEntryConflictException) {
            throw (IndexEntryConflictException) cause;
        }
        throw e;
    }
}

From source file:org.nlp4l.lucene.BuddyWordsFinder.java

License:Apache License

/**
 * Finds terms that co-occur with {@code term} within a +/- {@code slop}
 * position window, scored by document co-occurrence counts, using per-document
 * term vectors for position data.
 *
 * @param field the indexed field to analyze
 * @param term  the base term whose neighbours are sought
 * @return the top {@code maxCoiTermsPerTerm} scorers, or {@code null} when the
 *         base term is filtered out, has no postings, or a document lacks
 *         term vectors / position data
 * @throws IOException if the index cannot be read
 */
public Scorer[] findCoincidentalTerms(String field, BytesRef term) throws IOException {

    baseTermFilter.start(reader, field, term);
    if (baseTermFilter.skip(term) || baseTermFilter.skipByPopularity(term))
        return null;

    // NOTE(review): the original fetched MultiFields.getLiveDocs(reader) into
    // an unused local; removed. Deleted documents are therefore NOT filtered
    // out of this analysis — confirm whether that is intended.
    PostingsEnum de = MultiFields.getTermDocsEnum(reader, field, term);
    if (de == null)
        return null;
    int numDocsAnalyzed = 0;
    phraseTerms.clear();

    while (de.nextDoc() != PostingsEnum.NO_MORE_DOCS && numDocsAnalyzed < maxDocsToAnalyze) {
        int docId = de.docID();

        // First record all of the positions of the base term in a bitset
        // representing token positions in the current doc.
        int freq = de.freq();
        PostingsEnum pe = MultiFields.getTermPositionsEnum(reader, field, term);
        int ret = pe.advance(docId);
        if (ret == PostingsEnum.NO_MORE_DOCS)
            continue;
        termPos.clear();
        for (int i = 0; i < freq; i++) {
            int pos = pe.nextPosition();
            // NOTE(review): the size() guard suggests termPos is a
            // fixed-capacity bitset; positions past its capacity are silently
            // dropped — confirm this truncation is intended.
            if (pos < termPos.size())
                termPos.set(pos);
        }

        // Now look at all OTHER terms in this doc and see if they are
        // positioned in a pre-defined sized window around the current term.
        Fields vectors = reader.getTermVectors(docId);
        // Bail out entirely if this doc has no term vectors.
        if (vectors == null)
            return null;
        Terms vector = vectors.terms(field);
        // Bail out entirely if position info is missing.
        if (vector == null || !vector.hasPositions())
            return null;

        TermsEnum te = vector.iterator();
        BytesRef otherTerm = null;
        while ((otherTerm = te.next()) != null) {
            if (term.bytesEquals(otherTerm))
                continue;
            coiTermFilter.start(reader, field, otherTerm);
            if (coiTermFilter.skip(otherTerm) || coiTermFilter.skipByPopularity(otherTerm))
                continue;

            PostingsEnum pe2 = MultiFields.getTermPositionsEnum(reader, field, otherTerm);
            ret = pe2.advance(docId);
            if (ret == PostingsEnum.NO_MORE_DOCS)
                continue;
            freq = pe2.freq();
            boolean matchFound = false;
            for (int i = 0; i < freq && (!matchFound); i++) {
                int pos = pe2.nextPosition();
                int startpos = Math.max(0, pos - slop);
                int endpos = pos + slop;
                for (int prevpos = startpos; (prevpos <= endpos) && (!matchFound); prevpos++) {
                    if (termPos.get(prevpos)) {
                        // Count each co-occurring doc at most once per term
                        // (matchFound short-circuits the position loops).
                        Scorer pt = phraseTerms.get(otherTerm.utf8ToString());
                        if (pt == null) {
                            pt = new Scorer(baseTermFilter.getCurrentTermDocFreq(), otherTerm.utf8ToString(),
                                    coiTermFilter.getCurrentTermDocFreq());
                            phraseTerms.put(pt.coiTerm, pt);
                        }
                        pt.incCoiDocCount();
                        matchFound = true;
                    }
                }
            }
        }
        numDocsAnalyzed++;
    } // end of while loop

    // Now sort and dump the top terms associated with this term: push every
    // scorer through a bounded heap, then drain it into descending-rank order.
    TopTerms topTerms = new TopTerms(maxCoiTermsPerTerm);
    for (String key : phraseTerms.keySet()) {
        Scorer pt = phraseTerms.get(key);
        topTerms.insertWithOverflow(pt);
    }
    Scorer[] tops = new Scorer[topTerms.size()];
    int tp = tops.length - 1;
    while (topTerms.size() > 0) {
        Scorer top = topTerms.pop();
        tops[tp--] = top;
    }
    return tops;
}