List of usage examples for org.apache.lucene.util.automaton CharacterRunAutomaton CharacterRunAutomaton
public CharacterRunAutomaton(Automaton a)
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/**
 * Parses a specific affix rule putting the result into the provided affix map
 *
 * @param affixes Map where the result of the parsing will be put
 * @param header Header line of the affix rule
 * @param reader BufferedReader to read the content of the rule from
 * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
 *                         pattern
 * @param seenPatterns map from condition -> index of patterns, for deduplication.
 * @param seenStrips map from strip string -> ordinal, for deduplication.
 * @throws IOException Can be thrown while reading the rule
 * @throws ParseException if an affix rule line has fewer than four elements
 */
private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
        String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
        throws IOException, ParseException {
    BytesRefBuilder scratch = new BytesRefBuilder();
    StringBuilder sb = new StringBuilder();
    String args[] = header.split("\\s+");

    boolean crossProduct = args[2].equals("Y");
    // NOTE(review): identity comparison — relies on callers passing the shared
    // SUFFIX_CONDITION_REGEX_PATTERN constant itself, not an equal copy.
    boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

    int numLines = Integer.parseInt(args[3]);
    // each affix entry is encoded as 4 shorts = 8 bytes, hence the << 3
    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

    for (int i = 0; i < numLines; i++) {
        assert affixWriter.getPosition() == currentAffix << 3;
        String line = reader.readLine();
        String ruleArgs[] = line.split("\\s+");

        // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
        // condition is optional
        if (ruleArgs.length < 4) {
            throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                    reader.getLineNumber());
        }

        char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
        // "0" means nothing is stripped
        String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
        String affixArg = ruleArgs[3];
        char appendFlags[] = null;

        // first: parse continuation classes out of affix (the part after '/')
        int flagSep = affixArg.lastIndexOf('/');
        if (flagSep != -1) {
            String flagPart = affixArg.substring(flagSep + 1);
            affixArg = affixArg.substring(0, flagSep);

            // when aliases are in use, the flag part is an index into the alias table
            if (aliasCount > 0) {
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }

            appendFlags = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(appendFlags);
            twoStageAffix = true;
        }
        // zero affix -> empty string
        if ("0".equals(affixArg)) {
            affixArg = "";
        }

        String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
        // at least the gascon affix file has this issue (unclosed character class)
        if (condition.startsWith("[") && condition.indexOf(']') == -1) {
            condition = condition + "]";
        }
        // "dash hasn't got special meaning" (we must escape it)
        if (condition.indexOf('-') >= 0) {
            condition = escapeDash(condition);
        }

        final String regex;
        if (".".equals(condition)) {
            regex = ".*"; // Zero condition is indicated by dot
        } else if (condition.equals(strip)) {
            regex = ".*"; // TODO: optimize this better:
            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
            // but this is complicated...
        } else {
            regex = String.format(Locale.ROOT, conditionPattern, condition);
        }

        // deduplicate patterns: identical condition regexes share one automaton
        Integer patternIndex = seenPatterns.get(regex);
        if (patternIndex == null) {
            patternIndex = patterns.size();
            // pattern ordinal must fit in a short (see affixWriter below)
            if (patternIndex > Short.MAX_VALUE) {
                throw new UnsupportedOperationException(
                        "Too many patterns, please report this to dev@lucene.apache.org");
            }
            seenPatterns.put(regex, patternIndex);
            CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                    new RegExp(regex, RegExp.NONE).toAutomaton());
            patterns.add(pattern);
        }

        // deduplicate strip strings the same way
        Integer stripOrd = seenStrips.get(strip);
        if (stripOrd == null) {
            stripOrd = seenStrips.size();
            seenStrips.put(strip, stripOrd);
            if (stripOrd > Character.MAX_VALUE) {
                throw new UnsupportedOperationException(
                        "Too many unique strips, please report this to dev@lucene.apache.org");
            }
        }

        if (appendFlags == null) {
            appendFlags = NOFLAGS;
        }

        encodeFlags(scratch, appendFlags);
        int appendFlagsOrd = flagLookup.add(scratch.get());
        if (appendFlagsOrd < 0) {
            // already exists in our hash; convert the insertion point back to an ordinal
            appendFlagsOrd = (-appendFlagsOrd) - 1;
        } else if (appendFlagsOrd > Short.MAX_VALUE) {
            // this limit is probably flexible, but it's a good sanity check too
            throw new UnsupportedOperationException(
                    "Too many unique append flags, please report this to dev@lucene.apache.org");
        }

        // serialize the affix entry as 4 shorts: flag, strip ordinal,
        // (pattern ordinal << 1 | crossProduct), append-flags ordinal
        affixWriter.writeShort((short) flag);
        affixWriter.writeShort((short) stripOrd.intValue());
        // encode crossProduct into patternIndex
        int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
        affixWriter.writeShort((short) patternOrd);
        affixWriter.writeShort((short) appendFlagsOrd);

        if (needsInputCleaning) {
            CharSequence cleaned = cleanInput(affixArg, sb);
            affixArg = cleaned.toString();
        }

        // suffixes are stored reversed so lookup can walk the word from its end
        if (isSuffix) {
            affixArg = new StringBuilder(affixArg).reverse().toString();
        }

        List<Integer> list = affixes.get(affixArg);
        if (list == null) {
            list = new ArrayList<>();
            affixes.put(affixArg, list);
        }
        list.add(currentAffix);

        currentAffix++;
    }
}
From source file:hunspell_stemmer.Dictionary.java
License:Apache License
/**
 * Parses a specific affix rule putting the result into the provided affix map
 *
 * @param affixes Map where the result of the parsing will be put
 * @param header Header line of the affix rule
 * @param reader BufferedReader to read the content of the rule from
 * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
 *                         pattern
 * @param seenPatterns map from condition -> index of patterns, for deduplication.
 * @param seenStrips map from strip string -> ordinal, for deduplication.
 * @throws IOException Can be thrown while reading the rule
 * @throws ParseException if an affix rule line has fewer than four elements
 */
private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
        String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
        throws IOException, ParseException {
    BytesRefBuilder scratch = new BytesRefBuilder();
    StringBuilder sb = new StringBuilder();
    String args[] = header.split("\\s+");

    boolean crossProduct = args[2].equals("Y");
    // NOTE(review): identity comparison — relies on callers passing the shared
    // SUFFIX_CONDITION_REGEX_PATTERN constant itself, not an equal copy.
    boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

    int numLines = Integer.parseInt(args[3]);
    // each affix entry is encoded as 4 shorts = 8 bytes, hence the << 3
    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

    for (int i = 0; i < numLines; i++) {
        assert affixWriter.getPosition() == currentAffix << 3;
        String line = reader.readLine();
        String ruleArgs[] = line.split("\\s+");

        // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
        // condition is optional
        if (ruleArgs.length < 4) {
            throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                    reader.getLineNumber());
        }

        char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
        // "0" means nothing is stripped
        String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
        String affixArg = ruleArgs[3];
        char appendFlags[] = null;

        // first: parse continuation classes out of affix (the part after '/')
        int flagSep = affixArg.lastIndexOf('/');
        if (flagSep != -1) {
            String flagPart = affixArg.substring(flagSep + 1);
            affixArg = affixArg.substring(0, flagSep);

            // when aliases are in use, the flag part is an index into the alias table
            if (aliasCount > 0) {
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }

            appendFlags = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(appendFlags);
            twoStageAffix = true;
        }
        // zero affix -> empty string
        if ("0".equals(affixArg)) {
            affixArg = "";
        }

        String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
        // at least the gascon affix file has this issue (unclosed character class)
        if (condition.startsWith("[") && condition.indexOf(']') == -1) {
            condition = condition + "]";
        }
        // "dash hasn't got special meaning" (we must escape it)
        if (condition.indexOf('-') >= 0) {
            condition = escapeDash(condition);
        }

        final String regex;
        if (".".equals(condition)) {
            regex = ".*"; // Zero condition is indicated by dot
        } else if (condition.equals(strip)) {
            regex = ".*"; // TODO: optimize this better:
            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
            // but this is complicated...
        } else {
            regex = String.format(Locale.ROOT, conditionPattern, condition);
        }

        // deduplicate patterns: identical condition regexes share one automaton
        Integer patternIndex = seenPatterns.get(regex);
        if (patternIndex == null) {
            patternIndex = patterns.size();
            // pattern ordinal must fit in a short (see affixWriter below)
            if (patternIndex > Short.MAX_VALUE) {
                throw new UnsupportedOperationException(
                        "Too many patterns, please report this to dev@lucene.apache.org");
            }
            seenPatterns.put(regex, patternIndex);
            CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                    new RegExp(regex, RegExp.NONE).toAutomaton());
            patterns.add(pattern);
        }

        // deduplicate strip strings the same way
        Integer stripOrd = seenStrips.get(strip);
        if (stripOrd == null) {
            stripOrd = seenStrips.size();
            seenStrips.put(strip, stripOrd);
            if (stripOrd > Character.MAX_VALUE) {
                throw new UnsupportedOperationException(
                        "Too many unique strips, please report this to dev@lucene.apache.org");
            }
        }

        if (appendFlags == null) {
            appendFlags = NOFLAGS;
        }

        encodeFlags(scratch, appendFlags);
        int appendFlagsOrd = flagLookup.add(scratch.get());
        if (appendFlagsOrd < 0) {
            // already exists in our hash; convert the insertion point back to an ordinal
            appendFlagsOrd = (-appendFlagsOrd) - 1;
        } else if (appendFlagsOrd > Short.MAX_VALUE) {
            // this limit is probably flexible, but it's a good sanity check too
            throw new UnsupportedOperationException(
                    "Too many unique append flags, please report this to dev@lucene.apache.org");
        }

        // serialize the affix entry as 4 shorts: flag, strip ordinal,
        // (pattern ordinal << 1 | crossProduct), append-flags ordinal
        affixWriter.writeShort((short) flag);
        affixWriter.writeShort((short) stripOrd.intValue());
        // encode crossProduct into patternIndex
        int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
        affixWriter.writeShort((short) patternOrd);
        affixWriter.writeShort((short) appendFlagsOrd);

        if (needsInputCleaning) {
            CharSequence cleaned = cleanInput(affixArg, sb);
            affixArg = cleaned.toString();
        }

        // suffixes are stored reversed so lookup can walk the word from its end
        if (isSuffix) {
            affixArg = new StringBuilder(affixArg).reverse().toString();
        }

        List<Integer> list = affixes.get(affixArg);
        if (list == null) {
            list = new ArrayList<>();
            affixes.put(affixArg, list);
        }
        list.add(currentAffix);

        currentAffix++;
    }
}
From source file:org.apache.solr.analysis.MockTokenFilterFactory.java
License:Apache License
/** Creates a new MockTokenizerFactory */
public MockTokenFilterFactory(Map<String, String> args) {
    super(args);
    String stopset = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
    String stopregex = get(args, "stopregex");
    if (stopset != null) {
        if (stopregex != null) {
            throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
        }
        // "english" picks the canned English stopset; the only other accepted value is "empty".
        filter = "english".equalsIgnoreCase(stopset) ? MockTokenFilter.ENGLISH_STOPSET
                : MockTokenFilter.EMPTY_STOPSET;
    } else if (stopregex != null) {
        // Build the stopset automaton from a user-supplied regular expression.
        filter = new CharacterRunAutomaton(new RegExp(stopregex).toAutomaton());
    } else {
        throw new IllegalArgumentException(
                "Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
    }
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
    // any keys left over were not recognized
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}
From source file:org.apache.solr.core.MockTokenFilterFactory.java
License:Apache License
/** Creates a new MockTokenizerFactory */
public MockTokenFilterFactory(Map<String, String> args) {
    super(args);
    String stopset = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
    String stopregex = get(args, "stopregex");
    if (stopset != null) {
        if (stopregex != null) {
            throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
        }
        // "english" picks the canned English stopset; the only other accepted value is "empty".
        filter = "english".equalsIgnoreCase(stopset) ? MockTokenFilter.ENGLISH_STOPSET
                : MockTokenFilter.EMPTY_STOPSET;
    } else if (stopregex != null) {
        // Build the stopset automaton from a user-supplied regular expression.
        filter = new CharacterRunAutomaton(new RegExp(stopregex).toAutomaton());
    } else {
        throw new IllegalArgumentException(
                "Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
    }
    // any keys left over were not recognized
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}
From source file:org.codelibs.elasticsearch.common.xcontent.support.XContentMapValues.java
License:Apache License
/** * Returns a function that filters a document map based on the given include and exclude rules. * @see #filter(Map, String[], String[]) for details *//* www . j a v a 2 s. c om*/ public static Function<Map<String, ?>, Map<String, Object>> filter(String[] includes, String[] excludes) { CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString()); CharacterRunAutomaton include; if (includes == null || includes.length == 0) { include = matchAllAutomaton; } else { Automaton includeA = Regex.simpleMatchToAutomaton(includes); includeA = makeMatchDotsInFieldNames(includeA); include = new CharacterRunAutomaton(includeA); } Automaton excludeA; if (excludes == null || excludes.length == 0) { excludeA = Automata.makeEmpty(); } else { excludeA = Regex.simpleMatchToAutomaton(excludes); excludeA = makeMatchDotsInFieldNames(excludeA); } CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA); // NOTE: We cannot use Operations.minus because of the special case that // we want all sub properties to match as soon as an object matches return (map) -> filter(map, include, 0, exclude, 0, matchAllAutomaton); }
From source file:org.easynet.resource.queryparser.QueryParserTestBase.java
License:Apache License
public void testBoost() throws Exception { CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("on")); Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords); QueryParser qp = getParserConfig(oneStopAnalyzer); Query q = getQuery("on^1.0", qp); assertNotNull(q);/*from w ww. j a va 2s .c om*/ q = getQuery("\"hello\"^2.0", qp); assertNotNull(q); assertEquals(q.getBoost(), (float) 2.0, (float) 0.5); q = getQuery("hello^2.0", qp); assertNotNull(q); assertEquals(q.getBoost(), (float) 2.0, (float) 0.5); q = getQuery("\"on\"^1.0", qp); assertNotNull(q); Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); QueryParser qp2 = getParserConfig(a2); q = getQuery("the^3", qp2); // "the" is a stop word so the result is an empty query: assertNotNull(q); assertEquals("", q.toString()); assertEquals(1.0f, q.getBoost(), 0.01f); }
From source file:org.easynet.resource.queryparser.QueryParserTestBase.java
License:Apache License
public void testStopwords() throws Exception {
    // Analyzer stopping exactly the words "the" and "foo".
    CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
    QueryParser parser = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));

    // Both disjuncts are stop words -> empty boolean query.
    Query result = getQuery("field:the OR field:foo", parser);
    assertNotNull("result is null and it shouldn't be", result);
    assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
    BooleanQuery empty = (BooleanQuery) result;
    assertTrue(empty.clauses().size() + " does not equal: " + 0, empty.clauses().size() == 0);

    // One real term remains -> collapses to a plain TermQuery.
    result = getQuery("field:woo OR field:the", parser);
    assertNotNull("result is null and it shouldn't be", result);
    assertTrue("result is not a TermQuery", result instanceof TermQuery);

    // Mixed clauses: stop words drop out but the structure stays boolean.
    result = getQuery("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)", parser);
    assertNotNull("result is null and it shouldn't be", result);
    assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
    if (VERBOSE)
        System.out.println("Result: " + result);
    BooleanQuery mixed = (BooleanQuery) result;
    assertTrue(mixed.clauses().size() + " does not equal: " + 2, mixed.clauses().size() == 2);
}
From source file:org.easynet.resource.queryparser.QueryParserTestBase.java
License:Apache License
/**
 * Verifies that a stopped word inside a quoted phrase leaves a position gap
 * when position increments are enabled.
 */
public void testPhraseQueryPositionIncrements() throws Exception {
    // Case-insensitive stop list containing only "stop".
    CharacterRunAutomaton stopStopList = new CharacterRunAutomaton(
            new RegExp("[sS][tT][oO][pP]").toAutomaton());
    // Fix: the original built the parser twice back-to-back and discarded the
    // first instance; one construction is sufficient.
    QueryParser qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
    qp.setEnablePositionIncrements(true);

    // Expected: "1" at position 0, "2" at position 2 (the removed "stop" leaves a hole).
    PhraseQuery phraseQuery = new PhraseQuery();
    phraseQuery.add(new Term("field", "1"));
    phraseQuery.add(new Term("field", "2"), 2);

    assertEquals(phraseQuery, getQuery("\"1 stop 2\"", qp));
}
From source file:org.elasticsearch.index.reindex.TransportReindexAction.java
License:Apache License
/** * Build the {@link CharacterRunAutomaton} that represents the reindex-from-remote whitelist and make sure that it doesn't whitelist * the world./*from w ww . j ava2 s . c o m*/ */ static CharacterRunAutomaton buildRemoteWhitelist(List<String> whitelist) { if (whitelist.isEmpty()) { return new CharacterRunAutomaton(Automata.makeEmpty()); } Automaton automaton = Regex.simpleMatchToAutomaton(whitelist.toArray(Strings.EMPTY_ARRAY)); automaton = MinimizationOperations.minimize(automaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES); if (Operations.isTotal(automaton)) { throw new IllegalArgumentException("Refusing to start because whitelist " + whitelist + " accepts all addresses. " + "This would allow users to reindex-from-remote any URL they like effectively having Elasticsearch make HTTP GETs " + "for them."); } return new CharacterRunAutomaton(automaton); }
From source file:org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java
License:Open Source License
/** * test filtering two string fields//from ww w . j a v a 2s .c o m */ public void testIndexed() throws Exception { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(null); IndexWriter iw = new IndexWriter(dir, iwc); // add document with 2 fields Document doc = new Document(); doc.add(new StringField("fieldA", "test", Field.Store.NO)); doc.add(new StringField("fieldB", "test", Field.Store.NO)); iw.addDocument(doc); // open reader DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw), new CharacterRunAutomaton(Automata.makeString("fieldA"))); // see only one field LeafReader segmentReader = ir.leaves().get(0).reader(); Set<String> seenFields = new HashSet<>(); for (FieldInfo info : segmentReader.getFieldInfos()) { seenFields.add(info.name); } assertEquals(Collections.singleton("fieldA"), seenFields); assertNotNull(segmentReader.terms("fieldA")); assertNull(segmentReader.terms("fieldB")); TestUtil.checkReader(ir); IOUtils.close(ir, iw, dir); }