// NOTE(review): stray "Java tutorial" text commented out — it was not valid Java and broke compilation.
/* begin license * * * "Meresco Lucene" is a set of components and tools to integrate Lucene (based on PyLucene) into Meresco * * Copyright (C) 2015 Koninklijke Bibliotheek (KB) http://www.kb.nl * Copyright (C) 2015-2016 Seecr (Seek You Too B.V.) http://seecr.nl * * This file is part of "Meresco Lucene" * * "Meresco Lucene" is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * "Meresco Lucene" is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with "Meresco Lucene"; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * end license */ package org.meresco.lucene.suggestion; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.Executors; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.StringField; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; 
import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.queries.ChainedFilter; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.CachingWrapperFilter; import org.apache.lucene.search.QueryWrapperFilter; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.Filter; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version; import org.meresco.lucene.Utils; import org.meresco.lucene.search.SuperIndexSearcher; import org.meresco.lucene.search.TopScoreDocSuperCollector; import org.meresco.lucene.suggestion.SuggestionIndex.IndexingState; public class SuggestionNGramIndex { private static final String SUGGESTION_FIELDNAME = "__suggestion__"; private static final String CONCEPT_URI_FIELDNAME = "type"; private static final String BIGRAM_FIELDNAME = "__bigram__"; private static final String TRIGRAM_FIELDNAME = "__trigram__"; private static final String CREATOR_FIELDNAME = "creator"; private static final String KEY_FIELDNAME = "__keys__"; private static final NGramAnalyzer BIGRAM_ANALYZER = new NGramAnalyzer(2, 2); private static final NGramAnalyzer TRIGRAM_ANALYZER = new NGramAnalyzer(3, 3); private Field suggestionField = new Field(SUGGESTION_FIELDNAME, "", SuggestionIndex.SIMPLE_STORED_STRING_FIELD); private Field conceptUriField = new Field(CONCEPT_URI_FIELDNAME, "", SuggestionIndex.SIMPLE_STORED_STRING_FIELD); private Field creatorField = new Field(CREATOR_FIELDNAME, "", 
SuggestionIndex.SIMPLE_STORED_STRING_FIELD); private Field keyField = new BinaryDocValuesField(KEY_FIELDNAME, new BytesRef()); private final int maxCommitCount; private final IndexWriter writer; private final FSDirectory directory; private int commitCount; public long indexingTermsCount; public long totalTerms; public SuggestionNGramIndex(String directory) throws IOException { this(directory, 1); } public SuggestionNGramIndex(String directory, int commitCount) throws IOException { this.maxCommitCount = commitCount; this.directory = FSDirectory.open(new File(directory)); IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, new StandardAnalyzer()); this.writer = new IndexWriter(this.directory, config); this.writer.commit(); } public void createSuggestions(IndexReader reader, String suggestionFieldname, String keyFieldname, IndexingState indexingState) throws IOException { Bits liveDocs = MultiFields.getLiveDocs(reader); List<AtomicReaderContext> leaves = reader.leaves(); Terms terms = MultiFields.getTerms(reader, suggestionFieldname); if (terms == null) return; TermsEnum termsEnum = terms.iterator(null); BytesRef term; while ((term = termsEnum.next()) != null) { List<Long> keys = new ArrayList<>(); DocsEnum docsEnum = termsEnum.docs(liveDocs, null, DocsEnum.FLAG_NONE); while (true) { int docId = docsEnum.nextDoc(); if (docId == DocsEnum.NO_MORE_DOCS) { break; } keys.add(keyForDoc(docId, leaves, keyFieldname)); } if (keys.size() > 0) { String[] values = term.utf8ToString().split(SuggestionIndex.CONCAT_MARKER.replace("$", "\\$")); indexNGram(values[0], values[1], values[2], keys); indexingState.count++; } } this.commit(); } private Long keyForDoc(int docId, List<AtomicReaderContext> leaves, String keyFieldname) throws IOException { AtomicReaderContext context = leaves.get(ReaderUtil.subIndex(docId, leaves)); return context.reader().getNumericDocValues(keyFieldname).get(docId - context.docBase); } private void maybeCommitAfterUpdate() throws IOException { 
this.commitCount++; if (this.commitCount >= this.maxCommitCount) { this.commit(); } } public void commit() throws IOException { this.writer.commit(); this.commitCount = 0; } public void close() throws IOException { this.writer.close(); } private void indexNGram(String type, String creator, String term, List<Long> keys) throws IOException { Document doc = new Document(); this.suggestionField.setStringValue(term); doc.add(this.suggestionField); if (!type.equals("")) { this.conceptUriField.setStringValue(type); doc.add(this.conceptUriField); } if (!creator.equals("")) { this.creatorField.setStringValue(creator); doc.add(this.creatorField); } for (String n : ngrams(term, false)) { doc.add(new Field(BIGRAM_FIELDNAME, n, SuggestionIndex.SIMPLE_NOT_STORED_STRING_FIELD)); } for (String n : ngrams(term, true)) { doc.add(new Field(TRIGRAM_FIELDNAME, n, SuggestionIndex.SIMPLE_NOT_STORED_STRING_FIELD)); } keyField.setBytesValue(new BytesRef(Utils.join(keys, "|"))); doc.add(keyField); this.writer.addDocument(doc); maybeCommitAfterUpdate(); } public static List<String> ngrams(String s, Boolean trigram) throws IOException { List<String> ngram = new ArrayList<String>(); Analyzer ngramAnalyzer = trigram ? 
TRIGRAM_ANALYZER : BIGRAM_ANALYZER; TokenStream stream = ngramAnalyzer.tokenStream("ignored", s); stream.reset(); CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class); while (stream.incrementToken()) { ngram.add(termAttribute.toString()); } stream.close(); return ngram; } public Reader createReader(Map<String, DocIdSet> keySetFilters) throws IOException { return new Reader(directory, keySetFilters); } public static class Reader { private FSDirectory directory; private DirectoryReader reader; private IndexSearcher searcher; private Map<String, DocIdSet> filterKeySets; private Map<String, Filter> keySetFilters = new HashMap<>(); private Map<String, Filter> filterCache = new HashMap<>(); public Reader(FSDirectory directory, Map<String, DocIdSet> filterKeySets) throws IOException { this.directory = directory; this.filterKeySets = filterKeySets; reopen(); } public int numDocs() throws IOException { return this.reader.numDocs(); } public Suggestion[] suggest(String value, Boolean trigram, String[] filters) throws Exception { return suggest(value, trigram, filters, null); } public Suggestion[] suggest(String value, Boolean trigram, String[] filters, String keySetName) throws Exception { String ngramFieldName = trigram ? 
TRIGRAM_FIELDNAME : BIGRAM_FIELDNAME; BooleanQuery query = new BooleanQuery(); List<String> ngrams = ngrams(value, trigram); int SKIP_LAST_DOLLAR = 1; int ngramSize = ngrams.size() - SKIP_LAST_DOLLAR; for (int i = 0; i < ngramSize; i++) { query.add(new TermQuery(new Term(ngramFieldName, ngrams.get(i))), BooleanClause.Occur.MUST); } Filter keySetFilter = this.keySetFilters.get(keySetName); if (keySetFilter == null) { DocIdSet keys = filterKeySets.get(keySetName); if (keys != null) { keySetFilter = new CachingWrapperFilter(new SuggestionNGramKeysFilter(keys, KEY_FIELDNAME)); this.keySetFilters.put(keySetName, keySetFilter); } } Filter filter = createFilter(filters); if (filter == null) { filter = keySetFilter; } else if (keySetFilter != null) { filter = new ChainedFilter(new Filter[] { filter, keySetFilter }, ChainedFilter.AND); } TopDocs t = searcher.search(query, filter, 25); Suggestion[] suggestions = new Suggestion[t.totalHits < 25 ? t.totalHits : 25]; int i = 0; for (ScoreDoc d : t.scoreDocs) { Document doc = searcher.doc(d.doc); suggestions[i++] = new Suggestion(doc.get(SUGGESTION_FIELDNAME), doc.get(CONCEPT_URI_FIELDNAME), doc.get(CREATOR_FIELDNAME), d.score); } return suggestions; } public Filter createFilter(String[] filters) { if (filters == null || filters.length == 0) return null; Filter[] chain = new Filter[filters.length]; for (int i = 0; i < filters.length; i++) { String filterString = filters[i]; Filter filter = filterCache.get(filterString); if (filter == null) { String[] f = filterString.split("=", 2); filter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term(f[0], f[1])))); filterCache.put(filterString, filter); } chain[i] = filter; } if (chain.length == 1) return chain[0]; return new ChainedFilter(chain, ChainedFilter.OR); } public void close() throws IOException { this.reader.close(); } public synchronized void reopen() throws IOException { this.reader = DirectoryReader.open(directory); this.keySetFilters.clear(); 
this.filterCache.clear(); this.searcher = new IndexSearcher(this.reader, Executors.newFixedThreadPool(10)); } } public static class Suggestion { public String suggestion; public String type; public String creator; public float score; public Suggestion(String suggestion, String type, String creator, float score) { this.suggestion = suggestion; this.type = type; this.creator = creator; this.score = score; } } }