List of usage examples for org.apache.lucene.analysis BaseTokenStreamTestCase escape
public static String escape(String s)
From source file: org.elasticsearch.bwcompat.BasicAnalysisBackwardCompatibilityIT.java
License: Apache License
/** * Simple upgrade test for analyzers to make sure they analyze to the same tokens after upgrade * TODO we need this for random tokenizers / tokenfilters as well *///from www . j av a 2 s. c o m @Test public void testAnalyzerTokensAfterUpgrade() throws IOException, ExecutionException, InterruptedException { int numFields = randomIntBetween(PreBuiltAnalyzers.values().length, PreBuiltAnalyzers.values().length * 10); StringBuilder builder = new StringBuilder(); String[] fields = new String[numFields * 2]; int fieldId = 0; for (int i = 0; i < fields.length; i++) { fields[i++] = "field_" + fieldId++; String analyzer = randomAnalyzer(); fields[i] = "type=string,analyzer=" + analyzer; } assertAcked(prepareCreate("test").addMapping("type", fields).setSettings(indexSettings())); ensureYellow(); InputOutput[] inout = new InputOutput[numFields]; for (int i = 0; i < numFields; i++) { String input; Matcher matcher; do { // In Lucene 4.10, a bug was fixed in StandardTokenizer which was causing breaks on complex characters. // The bug was fixed without backcompat Version handling, so testing between >=4.10 vs <= 4.9 can // cause differences when the random string generated contains these complex characters. To mitigate // the problem, we skip any strings containing these characters. 
// TODO: only skip strings containing complex chars when comparing against ES <= 1.3.x input = TestUtil.randomAnalysisString(getRandom(), 100, false); matcher = complexUnicodeChars.matcher(input); } while (matcher.find()); AnalyzeResponse test = client().admin().indices().prepareAnalyze("test", input).setField("field_" + i) .get(); inout[i] = new InputOutput(test, input, "field_" + i); } logClusterState(); boolean upgraded; do { logClusterState(); upgraded = backwardsCluster().upgradeOneNode(); ensureYellow(); } while (upgraded); for (int i = 0; i < inout.length; i++) { InputOutput inputOutput = inout[i]; AnalyzeResponse test = client().admin().indices().prepareAnalyze("test", inputOutput.input) .setField(inputOutput.field).get(); List<AnalyzeResponse.AnalyzeToken> tokens = test.getTokens(); List<AnalyzeResponse.AnalyzeToken> expectedTokens = inputOutput.response.getTokens(); assertThat( "size mismatch field: " + fields[i * 2] + " analyzer: " + fields[i * 2 + 1] + " input: " + BaseTokenStreamTestCase.escape(inputOutput.input), expectedTokens.size(), equalTo(tokens.size())); for (int j = 0; j < tokens.size(); j++) { String msg = "failed for term: " + expectedTokens.get(j).getTerm() + " field: " + fields[i * 2] + " analyzer: " + fields[i * 2 + 1] + " input: " + BaseTokenStreamTestCase.escape(inputOutput.input); assertThat(msg, BaseTokenStreamTestCase.escape(expectedTokens.get(j).getTerm()), equalTo(BaseTokenStreamTestCase.escape(tokens.get(j).getTerm()))); assertThat(msg, expectedTokens.get(j).getPosition(), equalTo(tokens.get(j).getPosition())); assertThat(msg, expectedTokens.get(j).getStartOffset(), equalTo(tokens.get(j).getStartOffset())); assertThat(msg, expectedTokens.get(j).getEndOffset(), equalTo(tokens.get(j).getEndOffset())); assertThat(msg, expectedTokens.get(j).getType(), equalTo(tokens.get(j).getType())); } } }
From source file: org.elasticsearch.bwcompat.BasicAnalysisBackwardCompatibilityTests.java
License: Apache License
/** * Simple upgrade test for analyzers to make sure they analyze to the same tokens after upgrade * TODO we need this for random tokenizers / tokenfilters as well *//* w w w . j av a2s . c om*/ @Test public void testAnalyzerTokensAfterUpgrade() throws IOException, ExecutionException, InterruptedException { int numFields = randomIntBetween(PreBuiltAnalyzers.values().length, PreBuiltAnalyzers.values().length * 10); String[] fields = new String[numFields * 2]; int fieldId = 0; for (int i = 0; i < fields.length; i++) { fields[i++] = "field_" + fieldId++; String analyzer = randomAnalyzer(); fields[i] = "type=string,analyzer=" + analyzer; } assertAcked(prepareCreate("test").addMapping("type", fields).setSettings(indexSettings())); ensureYellow(); InputOutput[] inout = new InputOutput[numFields]; for (int i = 0; i < numFields; i++) { String input; Matcher matcher; do { // In Lucene 4.10, a bug was fixed in StandardTokenizer which was causing breaks on complex characters. // The bug was fixed without backcompat Version handling, so testing between >=4.10 vs <= 4.9 can // cause differences when the random string generated contains these complex characters. To mitigate // the problem, we skip any strings containing these characters. 
// TODO: only skip strings containing complex chars when comparing against ES <= 1.3.x input = TestUtil.randomAnalysisString(getRandom(), 100, false); matcher = complexUnicodeChars.matcher(input); } while (matcher.find()); AnalyzeResponse test = client().admin().indices().prepareAnalyze("test", input).setField("field_" + i) .get(); inout[i] = new InputOutput(test, input, "field_" + i); } logClusterState(); boolean upgraded; do { logClusterState(); upgraded = backwardsCluster().upgradeOneNode(); ensureYellow(); } while (upgraded); for (int i = 0; i < inout.length; i++) { InputOutput inputOutput = inout[i]; AnalyzeResponse test = client().admin().indices().prepareAnalyze("test", inputOutput.input) .setField(inputOutput.field).get(); List<AnalyzeResponse.AnalyzeToken> tokens = test.getTokens(); List<AnalyzeResponse.AnalyzeToken> expectedTokens = inputOutput.response.getTokens(); assertThat( "size mismatch field: " + fields[i * 2] + " analyzer: " + fields[i * 2 + 1] + " input: " + BaseTokenStreamTestCase.escape(inputOutput.input), expectedTokens.size(), equalTo(tokens.size())); for (int j = 0; j < tokens.size(); j++) { String msg = "failed for term: " + expectedTokens.get(j).getTerm() + " field: " + fields[i * 2] + " analyzer: " + fields[i * 2 + 1] + " input: " + BaseTokenStreamTestCase.escape(inputOutput.input); assertThat(msg, BaseTokenStreamTestCase.escape(expectedTokens.get(j).getTerm()), equalTo(BaseTokenStreamTestCase.escape(tokens.get(j).getTerm()))); assertThat(msg, expectedTokens.get(j).getPosition(), equalTo(tokens.get(j).getPosition())); assertThat(msg, expectedTokens.get(j).getStartOffset(), equalTo(tokens.get(j).getStartOffset())); assertThat(msg, expectedTokens.get(j).getEndOffset(), equalTo(tokens.get(j).getEndOffset())); assertThat(msg, expectedTokens.get(j).getType(), equalTo(tokens.get(j).getType())); } } }