// Java tutorial
/* LanguageTool, a natural language style checker * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.dev.index; import static org.languagetool.dev.dumpcheck.SentenceSourceIndexer.MAX_DOC_COUNT_FIELD; import static org.languagetool.dev.dumpcheck.SentenceSourceIndexer.MAX_DOC_COUNT_FIELD_VAL; import static org.languagetool.dev.dumpcheck.SentenceSourceIndexer.MAX_DOC_COUNT_VALUE; import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME; import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME_LOWERCASE; import static org.languagetool.dev.index.PatternRuleQueryBuilder.SOURCE_FIELD_NAME; import java.io.File; import java.io.IOException; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TimeLimitingCollector; import org.apache.lucene.search.TopDocs; 
import org.apache.lucene.search.TopFieldCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Counter; import org.languagetool.AnalyzedSentence; import org.languagetool.JLanguageTool; import org.languagetool.Language; import org.languagetool.rules.Rule; import org.languagetool.rules.RuleMatch; import org.languagetool.rules.patterns.PatternRule; import org.languagetool.tools.ContextTools; /** * A class with a main() method that takes a rule id and the location of the * index that runs the query on that index and prints all matches. * Will transparently handle rules that are not supported, i.e. run on the candidate matches * up to a limit. * * @author Tao Lin * @author Daniel Naber */ public class Searcher { private static boolean WIKITEXT_OUTPUT = false; private final Directory directory; private int maxHits = 1000; private int maxSearchTimeMillis = 5000; private IndexSearcher indexSearcher; private DirectoryReader reader; private boolean limitSearch = true; public Searcher(Directory directory) { this.directory = directory; } private void open() throws IOException { reader = DirectoryReader.open(directory); indexSearcher = new IndexSearcher(reader); //System.out.println("Opened index " + directory + " with " + indexSearcher.getIndexReader().numDocs() + " docs"); } private void close() throws IOException { if (reader != null) { reader.close(); } } public int getDocCount() throws IOException { try (DirectoryReader reader = DirectoryReader.open(directory)) { final IndexSearcher indexSearcher = new IndexSearcher(reader); return getDocCount(indexSearcher); } } private int getDocCount(IndexSearcher indexSearcher) throws IOException { final Term searchTerm = new Term(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL); final TopDocs search = indexSearcher.search(new TermQuery(searchTerm), 1); if (search.totalHits != 1) { return -1; } final ScoreDoc scoreDoc = search.scoreDocs[0]; final Document doc = 
indexSearcher.doc(scoreDoc.doc); return Integer.parseInt(doc.get(MAX_DOC_COUNT_VALUE)); } public int getMaxHits() { return maxHits; } public void setMaxHits(int maxHits) { this.maxHits = maxHits; } public int getMaxSearchTimeMillis() { return maxSearchTimeMillis; } public void setMaxSearchTimeMillis(int maxSearchTimeMillis) { this.maxSearchTimeMillis = maxSearchTimeMillis; } public SearcherResult findRuleMatchesOnIndex(PatternRule rule, Language language) throws IOException, UnsupportedPatternRuleException { // it seems wasteful to re-open the index every time, but I had strange problems (OOM, Array out of bounds, ...) // when not doing so... open(); try { final PatternRuleQueryBuilder patternRuleQueryBuilder = new PatternRuleQueryBuilder(language); final Query query = patternRuleQueryBuilder.buildRelaxedQuery(rule); if (query == null) { throw new NullPointerException("Cannot search on null query for rule: " + rule.getId()); } System.out.println("Running query: " + query.toString(FIELD_NAME_LOWERCASE)); final SearchRunnable runnable = new SearchRunnable(indexSearcher, query, language, rule); final Thread searchThread = new Thread(runnable); searchThread.start(); try { // using a TimeLimitingCollector is not enough, as it doesn't cover all time required to // search for a complicated regex, so interrupt the whole thread instead: if (limitSearch) { // I don't know a simpler way to achieve this... 
searchThread.join(maxSearchTimeMillis); } else { searchThread.join(Integer.MAX_VALUE); } searchThread.interrupt(); } catch (InterruptedException e) { throw new RuntimeException("Search thread got interrupted for query " + query, e); } if (searchThread.isInterrupted()) { throw new SearchTimeoutException( "Search timeout of " + maxSearchTimeMillis + "ms reached for query " + query); } final Exception exception = runnable.getException(); if (exception != null) { if (exception instanceof SearchTimeoutException) { throw (SearchTimeoutException) exception; } throw new RuntimeException( "Exception during search for query " + query + " on rule " + rule.getId(), exception); } final List<MatchingSentence> matchingSentences = runnable.getMatchingSentences(); final int sentencesChecked = getSentenceCheckCount(query, indexSearcher); final SearcherResult searcherResult = new SearcherResult(matchingSentences, sentencesChecked, query); searcherResult.setHasTooManyLuceneMatches(runnable.hasTooManyLuceneMatches()); searcherResult.setLuceneMatchCount(runnable.getLuceneMatchCount()); if (runnable.hasTooManyLuceneMatches()) { // more potential matches than we can check in an acceptable time :-( searcherResult.setDocCount(maxHits); } else { searcherResult.setDocCount(getDocCount(indexSearcher)); } //TODO: the search itself could also timeout, don't just ignore that: //searcherResult.setResultIsTimeLimited(limitedTopDocs.resultIsTimeLimited); return searcherResult; } finally { close(); } } private PossiblyLimitedTopDocs getTopDocs(Query query, Sort sort) throws IOException { final TopFieldCollector topCollector = TopFieldCollector.create(sort, maxHits, true, false, false, false); final Counter clock = Counter.newCounter(true); final int waitMillis = 1000; // TODO: if we interrupt the whole thread anyway, do we still need the TimeLimitingCollector? 
final TimeLimitingCollector collector = new TimeLimitingCollector(topCollector, clock, maxSearchTimeMillis / waitMillis); collector.setBaseline(0); final Thread counterThread = new Thread() { @Override public void run() { final long startTime = System.currentTimeMillis(); while (true) { final long runTimeMillis = System.currentTimeMillis() - startTime; if (runTimeMillis > maxSearchTimeMillis) { // make sure there's no lingering thread for too long return; } clock.addAndGet(1); try { Thread.sleep(waitMillis); } catch (InterruptedException e) { throw new RuntimeException(e); } } } }; counterThread.setName("LuceneSearchTimeoutThread"); counterThread.start(); boolean timeLimitActivated = false; try { indexSearcher.search(query, collector); } catch (TimeLimitingCollector.TimeExceededException e) { timeLimitActivated = true; } return new PossiblyLimitedTopDocs(topCollector.topDocs(), timeLimitActivated); } List<PatternRule> getRuleById(String ruleId, Language language) throws IOException { List<PatternRule> rules = new ArrayList<>(); JLanguageTool langTool = new JLanguageTool(language); langTool.activateDefaultPatternRules(); for (Rule rule : langTool.getAllRules()) { if (rule.getId().equals(ruleId) && rule instanceof PatternRule) { rules.add((PatternRule) rule); } } if (rules.size() > 0) { return rules; } else { throw new PatternRuleNotFoundException(ruleId, language); } } private int getSentenceCheckCount(Query query, IndexSearcher indexSearcher) { final int indexSize = indexSearcher.getIndexReader().numDocs(); // we actually check up to maxHits sentences: // TODO: ?? 
final int sentencesChecked = Math.min(maxHits, indexSize); return sentencesChecked; } private List<MatchingSentence> findMatchingSentences(IndexSearcher indexSearcher, TopDocs topDocs, JLanguageTool languageTool) throws IOException { final List<MatchingSentence> matchingSentences = new ArrayList<>(); for (ScoreDoc match : topDocs.scoreDocs) { final Document doc = indexSearcher.doc(match.doc); final String sentence = doc.get(FIELD_NAME); final List<RuleMatch> ruleMatches = languageTool.check(sentence); if (ruleMatches.size() > 0) { final String source = doc.get(SOURCE_FIELD_NAME); final String title = doc.get(Indexer.TITLE_FIELD_NAME); final AnalyzedSentence analyzedSentence = languageTool.getAnalyzedSentence(sentence); final MatchingSentence matchingSentence = new MatchingSentence(sentence, source, title, analyzedSentence, ruleMatches); matchingSentences.add(matchingSentence); } } return matchingSentences; } private JLanguageTool getLanguageToolWithOneRule(Language lang, PatternRule patternRule) { final JLanguageTool langTool = new JLanguageTool(lang); for (Rule rule : langTool.getAllActiveRules()) { langTool.disableRule(rule.getId()); } langTool.addRule(patternRule); langTool.enableDefaultOffRule(patternRule.getId()); // rule might be off by default return langTool; } class PossiblyLimitedTopDocs { TopDocs topDocs; boolean resultIsTimeLimited; PossiblyLimitedTopDocs(TopDocs topDocs, boolean resultIsTimeLimited) { this.topDocs = topDocs; this.resultIsTimeLimited = resultIsTimeLimited; } } private static void ensureCorrectUsageOrExit(String[] args) { if (args.length < 3 || (args.length == 4 && !"--no_limit".equals(args[3]))) { System.err.println("Usage: Searcher <ruleId> <languageCode> <indexDir> [--no_limit]"); System.err.println("\truleId Id of the rule to search for (or comma-separated list of ids)"); System.err.println("\tlanguageCode short language code, e.g. 
'en' for English"); System.err.println("\tindexDir path to a directory containing the index"); System.err.println("\t--no_limit do not limit search time"); System.exit(1); } } class SearchRunnable implements Runnable { private final IndexSearcher indexSearcher; private final Query query; private final Language language; private final PatternRule rule; private List<MatchingSentence> matchingSentences; private Exception exception; private boolean tooManyLuceneMatches; private int luceneMatchCount; SearchRunnable(IndexSearcher indexSearcher, Query query, Language language, PatternRule rule) { this.indexSearcher = indexSearcher; this.query = query; this.language = language; this.rule = rule; } @Override public void run() { try { final Sort sort = new Sort(new SortField("docCount", SortField.Type.INT)); // do not sort by relevance as this will move the shortest documents to the top final long t1 = System.currentTimeMillis(); final JLanguageTool languageTool = getLanguageToolWithOneRule(language, rule); final long langToolCreationTime = System.currentTimeMillis() - t1; final long t2 = System.currentTimeMillis(); final PossiblyLimitedTopDocs limitedTopDocs = getTopDocs(query, sort); final long luceneTime = System.currentTimeMillis() - t2; final long t3 = System.currentTimeMillis(); luceneMatchCount = limitedTopDocs.topDocs.totalHits; tooManyLuceneMatches = limitedTopDocs.topDocs.scoreDocs.length >= maxHits; matchingSentences = findMatchingSentences(indexSearcher, limitedTopDocs.topDocs, languageTool); System.out.println("Check done in " + langToolCreationTime + "/" + luceneTime + "/" + (System.currentTimeMillis() - t3) + "ms (LT creation/Lucene/matching) for " + limitedTopDocs.topDocs.scoreDocs.length + " docs"); } catch (Exception e) { exception = e; } } Exception getException() { return exception; } /** * There were more Lucene matches than we can actually check with LanguageTool in * an acceptable time, so real matches might be lost. 
*/ boolean hasTooManyLuceneMatches() { return tooManyLuceneMatches; } int getLuceneMatchCount() { return luceneMatchCount; } List<MatchingSentence> getMatchingSentences() { return matchingSentences; } } private static ContextTools getContextTools(int contextSize) { final ContextTools contextTools = new ContextTools(); contextTools.setEscapeHtml(false); contextTools.setContextSize(contextSize); contextTools.setErrorMarkerStart("**"); contextTools.setErrorMarkerEnd("**"); return contextTools; } public static void main(String[] args) throws Exception { ensureCorrectUsageOrExit(args); final long startTime = System.currentTimeMillis(); final String[] ruleIds = args[0].split(","); final String languageCode = args[1]; final Language language = Language.getLanguageForShortName(languageCode); final File indexDir = new File(args[2]); final boolean limitSearch = args.length > 3 && "--no_limit".equals(args[3]); final Searcher searcher = new Searcher(new SimpleFSDirectory(indexDir)); if (!limitSearch) { searcher.setMaxHits(100_000); } searcher.limitSearch = limitSearch; final ContextTools contextTools = getContextTools(140); int totalMatches = 0; for (String ruleId : ruleIds) { final long ruleStartTime = System.currentTimeMillis(); for (PatternRule rule : searcher.getRuleById(ruleId, language)) { System.out.println("===== " + ruleId + "[" + rule.getSubId() + "] ========================================================="); final SearcherResult searcherResult = searcher.findRuleMatchesOnIndex(rule, language); int i = 1; if (searcherResult.getMatchingSentences().size() == 0) { System.out.println("[no matches]"); } for (MatchingSentence ruleMatch : searcherResult.getMatchingSentences()) { for (RuleMatch match : ruleMatch.getRuleMatches()) { String context = contextTools.getContext(match.getFromPos(), match.getToPos(), ruleMatch.getSentence()); if (WIKITEXT_OUTPUT) { ContextTools contextTools2 = getContextTools(0); String coveredText = contextTools2.getContext(match.getFromPos(), 
match.getToPos(), ruleMatch.getSentence()); coveredText = coveredText.replaceFirst("^\\.\\.\\.", "").replaceFirst("\\.\\.\\.$", ""); coveredText = coveredText.replaceFirst("^\\*\\*", "").replaceFirst("\\*\\*$", ""); String encodedTextWithQuotes = URLEncoder.encode("\"" + coveredText + "\"", "UTF-8"); String searchLink = "https://de.wikipedia.org/w/index.php?search=" + encodedTextWithQuotes + "&title=Spezial%3ASuche&go=Artikel"; context = context.replaceAll("\\*\\*.*?\\*\\*", "[" + searchLink + " " + coveredText + "]"); String encTitle = URLEncoder.encode(ruleMatch.getTitle(), "UTF-8"); String encodedText = URLEncoder.encode(coveredText, "UTF-8"); System.out.println("# [[" + ruleMatch.getTitle() + "]]: " + context + " ([http://wikipedia.ramselehof.de/wikiblame.php?user_lang=de&lang=de&project=wikipedia&article=" + encTitle + "&needle=" + encodedText + "&skipversions=0&ignorefirst=0&limit=500&searchmethod=int&order=desc&start=Start WikiBlame])"); } else { System.out.println(i + ": " + context + " [" + ruleMatch.getSource() + "]"); } } totalMatches += ruleMatch.getRuleMatches().size(); i++; } System.out.println("Time: " + (System.currentTimeMillis() - ruleStartTime) + "ms"); } } System.out.println( "Total time: " + (System.currentTimeMillis() - startTime) + "ms, " + totalMatches + " matches"); } }