Usage examples for org.apache.lucene.index.Fields#terms(String)
public abstract Terms terms(String field) throws IOException;
From source file:org.languagetool.dev.bigdata.LargestNGramFinder.java
License:Open Source License
public static void main(String[] args) throws IOException { if (args.length != 1) { System.out.println("Usage: " + LargestNGramFinder.class.getSimpleName() + " <ngramIndexDir>"); System.exit(1);// w w w . ja v a 2 s .c o m } FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath()); IndexReader reader = DirectoryReader.open(fsDir); IndexSearcher searcher = new IndexSearcher(reader); Fields fields = MultiFields.getFields(reader); long max = 0; String maxTerm = ""; Terms terms = fields.terms("ngram"); TermsEnum termsEnum = terms.iterator(); int count = 0; BytesRef next; while ((next = termsEnum.next()) != null) { String term = next.utf8ToString(); TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5); int docId = topDocs.scoreDocs[0].doc; Document document = reader.document(docId); long thisCount = Long.parseLong(document.get("count")); if (max < thisCount) { max = thisCount; maxTerm = term; } if (count % 10_000 == 0) { System.out.println(count + " -> " + topDocs.totalHits + " for " + term + " -> " + thisCount + ", max so far: " + max + " for '" + maxTerm + "'"); } count++; } System.out.println("Max: " + max + " for " + maxTerm); }
From source file:org.languagetool.dev.bigdata.NeededNGramCounter.java
License:Open Source License
public static void main(String[] args) throws IOException { if (args.length != 1) { System.out.println("Usage: " + NeededNGramCounter.class.getSimpleName() + " <ngramIndexDir>"); System.exit(1);//ww w . jav a 2 s .c om } Language lang = Languages.getLanguageForShortCode(LANG); String path = "/" + lang.getShortCode() + "/confusion_sets.txt"; Set<String> ngrams; try (InputStream confSetStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(path)) { ngrams = new ConfusionSetLoader().loadConfusionSet(confSetStream).keySet(); } String ngramIndexDir = args[0]; FSDirectory fsDir = FSDirectory.open(new File(ngramIndexDir).toPath()); IndexReader reader = DirectoryReader.open(fsDir); Fields fields = MultiFields.getFields(reader); Terms terms = fields.terms("ngram"); TermsEnum termsEnum = terms.iterator(); int i = 0; int needed = 0; int notNeeded = 0; BytesRef next; while ((next = termsEnum.next()) != null) { String term = next.utf8ToString(); String[] tmpTerms = term.split(" "); boolean ngramNeeded = false; for (String tmpTerm : tmpTerms) { if (ngrams.contains(tmpTerm)) { ngramNeeded = true; break; } } if (ngramNeeded) { //System.out.println("needed: " + term); needed++; } else { //System.out.println("not needed: " + term); notNeeded++; } if (i % 500_000 == 0) { System.out.println(i + "/" + terms.getDocCount()); } i++; } System.out.println("language : " + LANG); System.out.println("ngram index : " + ngramIndexDir); System.out.println("needed ngrams : " + needed); System.out.println("not needed ngrams: " + notNeeded); }
From source file:org.languagetool.dev.HomophoneOccurrenceDumper.java
License:Open Source License
/** Returns a terms iterator over the "ngram" field of the 3-gram index. */
private TermsEnum getIterator() throws IOException {
    LuceneSearcher searcher = getLuceneSearcher(3);
    Terms ngramTerms = MultiFields.getFields(searcher.getReader()).terms("ngram");
    return ngramTerms.iterator(null);
}
From source file:org.lexevs.dao.index.lucene.v2010.metadata.LuceneMetadataDao.java
License:Open Source License
/**
 * Lists all coding scheme name/version pairs present in the index by
 * enumerating the distinct terms of the "codingSchemeNameVersion" field and
 * reading the registered name and version from one matching document each.
 */
@Override
public AbsoluteCodingSchemeVersionReferenceList listCodingSchemes() {
    AbsoluteCodingSchemeVersionReferenceList result = new AbsoluteCodingSchemeVersionReferenceList();
    try {
        // Obtain a TermsEnum over the distinct "codingSchemeNameVersion" terms;
        // stays null when the index has no fields or lacks that field.
        final TermsEnum te = luceneIndexTemplate.executeInIndexReader(new IndexReaderCallback<TermsEnum>() {
            @Override
            public TermsEnum doInIndexReader(IndexReader indexReader) throws Exception {
                TermsEnum termsEnum = null;
                Fields fields = MultiFields.getFields(indexReader);
                if (fields != null) {
                    Terms terms = fields.terms("codingSchemeNameVersion");
                    if (terms != null) {
                        termsEnum = terms.iterator();
                    }
                }
                return termsEnum;
            }
        });
        // TODO see Multifield for a better implementation of this.
        BytesRef text = null;
        while ((te != null) && (text = te.next()) != null) {
            // one search per distinct term; only the first hit is inspected
            Query temp = new TermQuery(new Term("codingSchemeNameVersion", text.utf8ToString()));
            List<ScoreDoc> d = this.luceneIndexTemplate.search(temp, null);
            if (d.size() > 0) {
                ScoreDoc doc = d.get(0);
                AbsoluteCodingSchemeVersionReference acsvr = new AbsoluteCodingSchemeVersionReference();
                Document document = luceneIndexTemplate.getDocumentById(doc.doc);
                acsvr.setCodingSchemeURN(document.get("codingSchemeRegisteredName"));
                acsvr.setCodingSchemeVersion(document.get("codingSchemeVersion"));
                result.addAbsoluteCodingSchemeVersionReference(acsvr);
            }
        }
        return result;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:org.meresco.lucene.Lucene.java
License:Open Source License
public LuceneResponse similarDocuments(String identifier) throws Throwable { SearcherAndTaxonomy reference = data.getManager().acquire(); try {//from ww w . j a v a2 s . c o m Query idQuery = new TermQuery(new Term(ID_FIELD, identifier)); TopDocs topDocs = reference.searcher.search(idQuery, 1); if (topDocs.totalHits == 0) return new LuceneResponse(0); int docId = topDocs.scoreDocs[0].doc; IndexReader reader = reference.searcher.getIndexReader(); CommonTermsQuery commonQuery = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, 0.1f); Fields termVectors = reader.getTermVectors(docId); if (termVectors == null) return new LuceneResponse(0); for (String field : termVectors) { TermsEnum iterator = termVectors.terms(field).iterator(null); BytesRef b; while ((b = iterator.next()) != null) { Term term = new Term(field, b.utf8ToString()); commonQuery.add(term); } } BooleanQuery query = new BooleanQuery(); query.add(idQuery, Occur.MUST_NOT); query.add(commonQuery, Occur.MUST); return executeQuery(query); } finally { data.getManager().release(reference); } }
From source file:org.musicbrainz.search.index.ReleaseGroupIndexTest.java
License:Open Source License
@Test public void testIndexReleaseGroupNumReleases() throws Exception { addReleaseGroupOne();//w ww. j a v a2s.co m RAMDirectory ramDir = new RAMDirectory(); createIndex(ramDir); IndexReader ir = DirectoryReader.open(ramDir); assertEquals(2, ir.numDocs()); { Fields fields = MultiFields.getFields(ir); Terms terms = fields.terms(ReleaseGroupIndexField.NUM_RELEASES.getName()); TermsEnum termsEnum = terms.iterator(null); termsEnum.next(); assertEquals(1, NumericUtils.prefixCodedToInt(termsEnum.term())); } }
From source file:org.musicbrainz.search.index.ReleaseGroupIndexTest.java
License:Open Source License
/**
 * Indexes release group four (credited to multiple artists) and verifies the
 * indexed terms of the artist name / id / name-credit fields plus the
 * unserialized artist credit (names, aliases, sort names, locales) stored in
 * the document.
 */
@Test
public void testIndexReleaseGroupMultipleArtists() throws Exception {
    addReleaseGroupFour();
    RAMDirectory ramDir = new RAMDirectory();
    createIndex(ramDir);
    IndexReader ir = DirectoryReader.open(ramDir);
    assertEquals(2, ir.numDocs());
    {
        Document doc = ir.document(1);
        Fields fields = MultiFields.getFields(ir);
        // ARTIST_NAME: analyzed tokens of both artist names, enumerated in term order
        Terms terms = fields.terms(ReleaseGroupIndexField.ARTIST_NAME.getName());
        TermsEnum tr = terms.iterator(null);
        tr.next();
        assertEquals(1, tr.docFreq());
        assertEquals("cincinnati", tr.term().utf8ToString());
        tr.next();
        assertEquals("erich", tr.term().utf8ToString());
        tr.next();
        assertEquals("kunzel", tr.term().utf8ToString());
        tr.next();
        assertEquals("kunzstel", tr.term().utf8ToString());
        tr.next();
        assertEquals("kunzstelein", tr.term().utf8ToString());
        tr.next();
        assertEquals("orchestra", tr.term().utf8ToString());
        tr.next();
        assertEquals("pops", tr.term().utf8ToString());
        tr.next();
        assertEquals("the", tr.term().utf8ToString());
        // ARTIST_ID: one term per credited artist
        terms = fields.terms(ReleaseGroupIndexField.ARTIST_ID.getName());
        tr = terms.iterator(null);
        tr.next();
        assertEquals(1, tr.docFreq());
        assertEquals("99845d0c-f239-4051-a6b1-4b5e9f7ede0b", tr.term().utf8ToString());
        tr.next();
        assertEquals("d8fbd94c-cd06-4e8b-a559-761ad969d07e", tr.term().utf8ToString());
        tr.next();
        // ARTIST_NAMECREDIT: tokens of the names exactly as credited
        terms = fields.terms(ReleaseGroupIndexField.ARTIST_NAMECREDIT.getName());
        tr = terms.iterator(null);
        tr.next();
        assertEquals(1, tr.docFreq());
        assertEquals("cincinnati", tr.term().utf8ToString());
        tr.next();
        assertEquals("erich", tr.term().utf8ToString());
        tr.next();
        assertEquals("kunzel", tr.term().utf8ToString());
        tr.next();
        assertEquals("pops", tr.term().utf8ToString());
        // stored release group fields
        assertEquals("Epics", doc.getFields(ReleaseGroupIndexField.RELEASEGROUP.getName())[0].stringValue());
        assertEquals("efd2ace2-b3b9-305f-8a53-9803595c0e37",
                doc.getFields(ReleaseGroupIndexField.RELEASEGROUP_ID.getName())[0].stringValue());
        // the serialized artist credit round-trips with names, aliases and locales
        ArtistCredit ac = ArtistCreditHelper
                .unserialize(doc.get(ReleaseGroupIndexField.ARTIST_CREDIT.getName()));
        assertNotNull(ac);
        assertEquals("Erich Kunzel", ac.getNameCredit().get(0).getArtist().getName());
        assertEquals("Cincinnati Pops", ac.getNameCredit().get(1).getName());
        assertEquals("The Cincinnati Pops Orchestra", ac.getNameCredit().get(1).getArtist().getName());
        assertEquals(2, ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().size());
        assertNull(ac.getNameCredit().get(1).getArtist().getAliasList());
        assertEquals("Erich Kunzstel",
                ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().get(0).getContent());
        assertEquals("Erich Kunzstelein",
                ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().get(1).getContent());
        assertEquals("Kunzstel, Erich",
                ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().get(0).getSortName());
        assertEquals("Kunzstelein, Erich",
                ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().get(1).getSortName());
        assertEquals("en", ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().get(0).getLocale());
        assertEquals("de", ac.getNameCredit().get(0).getArtist().getAliasList().getAlias().get(1).getLocale());
    }
    ir.close();
}
From source file:org.neo4j.kernel.api.impl.schema.verification.PartitionedUniquenessVerifier.java
License:Open Source License
/**
 * Merges the per-leaf Terms for {@code fieldName} into a single MultiTerms
 * view, skipping leaf readers that do not contain the field.
 */
private Terms termsForField(String fieldName) throws IOException {
    List<Terms> collected = new ArrayList<>();
    List<ReaderSlice> slices = new ArrayList<>();
    for (LeafReader leaf : allLeafReaders()) {
        Terms leafTerms = leaf.fields().terms(fieldName);
        if (leafTerms == null) {
            continue;
        }
        collected.add(leafTerms);
        slices.add(new ReaderSlice(0, Math.toIntExact(leafTerms.size()), 0));
    }
    return new MultiTerms(
            collected.toArray(new Terms[collected.size()]),
            slices.toArray(new ReaderSlice[slices.size()]));
}
From source file:org.neo4j.kernel.api.impl.schema.verification.SimpleUniquenessVerifier.java
License:Open Source License
/**
 * Scans every field of every leaf reader (except the internal node-id field)
 * and, for each term that occurs in more than one document, re-runs a search
 * with a duplicate-checking collector to surface uniqueness violations.
 *
 * @throws IndexEntryConflictException if two entries share the same property value
 */
@Override
public void verify(PropertyAccessor accessor, int propKeyId) throws IndexEntryConflictException, IOException {
    try {
        DuplicateCheckingCollector collector = new DuplicateCheckingCollector(accessor, propKeyId);
        IndexSearcher searcher = indexSearcher();
        for (LeafReaderContext leafReaderContext : searcher.getIndexReader().leaves()) {
            Fields fields = leafReaderContext.reader().fields();
            for (String field : fields) {
                // the node-id field is bookkeeping, not a user property — skip it
                if (LuceneDocumentStructure.NODE_ID_KEY.equals(field)) {
                    continue;
                }
                TermsEnum terms = LuceneDocumentStructure.originalTerms(fields.terms(field), field);
                BytesRef termsRef;
                while ((termsRef = terms.next()) != null) {
                    // docFreq > 1 means the value appears in several documents:
                    // only then is a (comparatively expensive) search needed
                    if (terms.docFreq() > 1) {
                        collector.reset();
                        searcher.search(new TermQuery(new Term(field, termsRef)), collector);
                    }
                }
            }
        }
    } catch (IOException e) {
        // the collector smuggles conflicts out as the cause of an IOException;
        // unwrap and rethrow the conflict, otherwise propagate the I/O error
        Throwable cause = e.getCause();
        if (cause instanceof IndexEntryConflictException) {
            throw (IndexEntryConflictException) cause;
        }
        throw e;
    }
}
From source file:org.nlp4l.lucene.BuddyWordsFinder.java
License:Apache License
/**
 * For a given base term, finds other terms that co-occur within {@code slop}
 * positions of it across up to {@code maxDocsToAnalyze} documents, and
 * returns them as scorers in ascending score order.
 *
 * @param field the field to analyze; its documents must carry term vectors
 *              with position information
 * @param term  the base term whose neighbors are sought
 * @return the top co-occurring term scorers, or null when the base term is
 *         filtered out, has no postings, or a visited document lacks
 *         positional term vectors
 */
public Scorer[] findCoincidentalTerms(String field, BytesRef term) throws IOException {
    baseTermFilter.start(reader, field, term);
    if (baseTermFilter.skip(term) || baseTermFilter.skipByPopularity(term))
        return null;
    //System.out.println(term.utf8ToString());
    // NOTE(review): liveDocs is fetched but never consulted below, so deleted
    // documents are not filtered out — confirm whether that is intended
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    PostingsEnum de = MultiFields.getTermDocsEnum(reader, field, term);
    if (de == null)
        return null;
    int numDocsAnalyzed = 0;
    phraseTerms.clear();
    while (de.nextDoc() != PostingsEnum.NO_MORE_DOCS && numDocsAnalyzed < maxDocsToAnalyze) {
        int docId = de.docID();
        //first record all of the positions of the term in a bitset which
        // represents terms in the current doc.
        int freq = de.freq();
        PostingsEnum pe = MultiFields.getTermPositionsEnum(reader, field, term);
        int ret = pe.advance(docId);
        if (ret == PostingsEnum.NO_MORE_DOCS)
            continue;
        termPos.clear();
        for (int i = 0; i < freq; i++) {
            int pos = pe.nextPosition();
            // positions beyond the bitset capacity are silently dropped
            if (pos < termPos.size())
                termPos.set(pos);
        }
        // now look at all OTHER terms in this doc and see if they are
        // positioned in a pre-defined sized window around the current term
        Fields vectors = reader.getTermVectors(docId);
        // check it has term vectors
        if (vectors == null)
            return null;
        Terms vector = vectors.terms(field);
        // check it has position info
        if (vector == null || !vector.hasPositions())
            return null;
        TermsEnum te = vector.iterator();
        BytesRef otherTerm = null;
        while ((otherTerm = te.next()) != null) {
            if (term.bytesEquals(otherTerm))
                continue;
            coiTermFilter.start(reader, field, otherTerm);
            if (coiTermFilter.skip(otherTerm) || coiTermFilter.skipByPopularity(otherTerm))
                continue;
            PostingsEnum pe2 = MultiFields.getTermPositionsEnum(reader, field, otherTerm);
            ret = pe2.advance(docId);
            if (ret == PostingsEnum.NO_MORE_DOCS)
                continue;
            freq = pe2.freq();
            boolean matchFound = false;
            // test every occurrence of the other term against the slop window
            for (int i = 0; i < freq && (!matchFound); i++) {
                int pos = pe2.nextPosition();
                int startpos = Math.max(0, pos - slop);
                int endpos = pos + slop;
                for (int prevpos = startpos; (prevpos <= endpos) && (!matchFound); prevpos++) {
                    if (termPos.get(prevpos)) {
                        // Add term to hashmap containing co-occurence
                        // counts for this term
                        Scorer pt = phraseTerms.get(otherTerm.utf8ToString());
                        if (pt == null) {
                            pt = new Scorer(baseTermFilter.getCurrentTermDocFreq(), otherTerm.utf8ToString(),
                                    coiTermFilter.getCurrentTermDocFreq());
                            phraseTerms.put(pt.coiTerm, pt);
                        }
                        // counted at most once per document (matchFound short-circuits)
                        pt.incCoiDocCount();
                        matchFound = true;
                    }
                }
            }
        }
        numDocsAnalyzed++;
    } // end of while loop
    // now sort and dump the top terms associated with this term.
    TopTerms topTerms = new TopTerms(maxCoiTermsPerTerm);
    for (String key : phraseTerms.keySet()) {
        Scorer pt = phraseTerms.get(key);
        topTerms.insertWithOverflow(pt);
    }
    // drain the priority queue back-to-front so the array ends up ascending
    Scorer[] tops = new Scorer[topTerms.size()];
    int tp = tops.length - 1;
    while (topTerms.size() > 0) {
        Scorer top = topTerms.pop();
        tops[tp--] = top;
    }
    return tops;
}