Example usage for org.apache.lucene.analysis BaseTokenStreamTestCase escape

List of usage examples for org.apache.lucene.analysis BaseTokenStreamTestCase escape

Introduction

On this page you can find an example usage of org.apache.lucene.analysis BaseTokenStreamTestCase.escape.

Prototype

public static String escape(String s) 

Source Link

Usage

From source file: org.elasticsearch.bwcompat.BasicAnalysisBackwardCompatibilityIT.java

License: Apache License

/**
 * Simple upgrade test for analyzers to make sure they analyze to the same tokens after upgrade.
 * Records the token stream produced for a random input per field before the rolling upgrade,
 * then re-analyzes the same input afterwards and asserts term, position, offsets and type match.
 * TODO we need this for random tokenizers / tokenfilters as well
 */
@Test
public void testAnalyzerTokensAfterUpgrade() throws IOException, ExecutionException, InterruptedException {
    // Create between N and 10*N string fields (N = number of pre-built analyzers),
    // each mapped to a randomly chosen analyzer. "fields" is a flat array of
    // alternating field-name / mapping-definition entries, as addMapping expects.
    int numFields = randomIntBetween(PreBuiltAnalyzers.values().length, PreBuiltAnalyzers.values().length * 10);
    String[] fields = new String[numFields * 2];
    int fieldId = 0;
    for (int i = 0; i < fields.length; i++) {
        fields[i++] = "field_" + fieldId++;
        String analyzer = randomAnalyzer();
        fields[i] = "type=string,analyzer=" + analyzer;
    }
    assertAcked(prepareCreate("test").addMapping("type", fields).setSettings(indexSettings()));
    ensureYellow();
    // Capture the pre-upgrade analysis result for each field so it can be
    // compared against the post-upgrade result below.
    InputOutput[] inout = new InputOutput[numFields];
    for (int i = 0; i < numFields; i++) {
        String input;
        Matcher matcher;
        do {
            // In Lucene 4.10, a bug was fixed in StandardTokenizer which was causing breaks on complex characters.
            // The bug was fixed without backcompat Version handling, so testing between >=4.10 vs <= 4.9 can
            // cause differences when the random string generated contains these complex characters. To mitigate
            // the problem, we skip any strings containing these characters.
            // TODO: only skip strings containing complex chars when comparing against ES <= 1.3.x
            input = TestUtil.randomAnalysisString(getRandom(), 100, false);
            matcher = complexUnicodeChars.matcher(input);
        } while (matcher.find());

        AnalyzeResponse test = client().admin().indices().prepareAnalyze("test", input).setField("field_" + i)
                .get();
        inout[i] = new InputOutput(test, input, "field_" + i);
    }

    logClusterState();
    // Roll the whole cluster: upgrade one node at a time until upgradeOneNode()
    // reports there is nothing left to upgrade, waiting for yellow in between.
    boolean upgraded;
    do {
        logClusterState();
        upgraded = backwardsCluster().upgradeOneNode();
        ensureYellow();
    } while (upgraded);

    // Re-analyze every recorded input on the upgraded cluster and assert the
    // token streams are identical to the pre-upgrade ones.
    for (int i = 0; i < inout.length; i++) {
        InputOutput inputOutput = inout[i];
        AnalyzeResponse test = client().admin().indices().prepareAnalyze("test", inputOutput.input)
                .setField(inputOutput.field).get();
        List<AnalyzeResponse.AnalyzeToken> tokens = test.getTokens();
        List<AnalyzeResponse.AnalyzeToken> expectedTokens = inputOutput.response.getTokens();
        assertThat(
                "size mismatch field: " + fields[i * 2] + " analyzer: " + fields[i * 2 + 1] + " input: "
                        + BaseTokenStreamTestCase.escape(inputOutput.input),
                expectedTokens.size(), equalTo(tokens.size()));
        for (int j = 0; j < tokens.size(); j++) {
            // escape() makes non-ASCII input readable in failure messages.
            String msg = "failed for term: " + expectedTokens.get(j).getTerm() + " field: " + fields[i * 2]
                    + " analyzer: " + fields[i * 2 + 1] + " input: "
                    + BaseTokenStreamTestCase.escape(inputOutput.input);
            assertThat(msg, BaseTokenStreamTestCase.escape(expectedTokens.get(j).getTerm()),
                    equalTo(BaseTokenStreamTestCase.escape(tokens.get(j).getTerm())));
            assertThat(msg, expectedTokens.get(j).getPosition(), equalTo(tokens.get(j).getPosition()));
            assertThat(msg, expectedTokens.get(j).getStartOffset(), equalTo(tokens.get(j).getStartOffset()));
            assertThat(msg, expectedTokens.get(j).getEndOffset(), equalTo(tokens.get(j).getEndOffset()));
            assertThat(msg, expectedTokens.get(j).getType(), equalTo(tokens.get(j).getType()));
        }
    }
}

From source file: org.elasticsearch.bwcompat.BasicAnalysisBackwardCompatibilityTests.java

License: Apache License

/**
 * Simple upgrade test for analyzers: records the token stream for a random input
 * per field before a rolling upgrade, then re-analyzes the same input afterwards
 * and asserts the streams are identical (term, position, offsets, type).
 * TODO we need this for random tokenizers / tokenfilters as well
 */
@Test
public void testAnalyzerTokensAfterUpgrade() throws IOException, ExecutionException, InterruptedException {
    int numFields = randomIntBetween(PreBuiltAnalyzers.values().length, PreBuiltAnalyzers.values().length * 10);
    // Alternating field-name / mapping-definition entries, two slots per field.
    String[] fields = new String[numFields * 2];
    int nextFieldId = 0;
    for (int slot = 0; slot < fields.length; slot += 2) {
        fields[slot] = "field_" + nextFieldId++;
        fields[slot + 1] = "type=string,analyzer=" + randomAnalyzer();
    }
    assertAcked(prepareCreate("test").addMapping("type", fields).setSettings(indexSettings()));
    ensureYellow();
    // Pre-upgrade snapshot of the analysis result for every field.
    InputOutput[] recorded = new InputOutput[numFields];
    for (int fieldIdx = 0; fieldIdx < numFields; fieldIdx++) {
        // In Lucene 4.10, a bug was fixed in StandardTokenizer which was causing breaks on complex characters.
        // The bug was fixed without backcompat Version handling, so testing between >=4.10 vs <= 4.9 can
        // cause differences when the random string generated contains these complex characters. To mitigate
        // the problem, we skip any strings containing these characters.
        // TODO: only skip strings containing complex chars when comparing against ES <= 1.3.x
        String candidate = TestUtil.randomAnalysisString(getRandom(), 100, false);
        while (complexUnicodeChars.matcher(candidate).find()) {
            candidate = TestUtil.randomAnalysisString(getRandom(), 100, false);
        }

        AnalyzeResponse before = client().admin().indices().prepareAnalyze("test", candidate)
                .setField("field_" + fieldIdx).get();
        recorded[fieldIdx] = new InputOutput(before, candidate, "field_" + fieldIdx);
    }

    logClusterState();
    // Rolling upgrade: bump one node at a time until none are left, waiting
    // for yellow health after each step.
    while (true) {
        logClusterState();
        boolean moreToUpgrade = backwardsCluster().upgradeOneNode();
        ensureYellow();
        if (!moreToUpgrade) {
            break;
        }
    }

    // Re-run analysis on the upgraded cluster and compare token-by-token.
    for (int fieldIdx = 0; fieldIdx < recorded.length; fieldIdx++) {
        InputOutput pair = recorded[fieldIdx];
        AnalyzeResponse after = client().admin().indices().prepareAnalyze("test", pair.input)
                .setField(pair.field).get();
        List<AnalyzeResponse.AnalyzeToken> actual = after.getTokens();
        List<AnalyzeResponse.AnalyzeToken> expected = pair.response.getTokens();
        assertThat(
                "size mismatch field: " + fields[fieldIdx * 2] + " analyzer: " + fields[fieldIdx * 2 + 1]
                        + " input: " + BaseTokenStreamTestCase.escape(pair.input),
                expected.size(), equalTo(actual.size()));
        for (int tok = 0; tok < actual.size(); tok++) {
            AnalyzeResponse.AnalyzeToken want = expected.get(tok);
            AnalyzeResponse.AnalyzeToken got = actual.get(tok);
            // escape() keeps non-ASCII input readable in failure messages.
            String msg = "failed for term: " + want.getTerm() + " field: " + fields[fieldIdx * 2]
                    + " analyzer: " + fields[fieldIdx * 2 + 1] + " input: "
                    + BaseTokenStreamTestCase.escape(pair.input);
            assertThat(msg, BaseTokenStreamTestCase.escape(want.getTerm()),
                    equalTo(BaseTokenStreamTestCase.escape(got.getTerm())));
            assertThat(msg, want.getPosition(), equalTo(got.getPosition()));
            assertThat(msg, want.getStartOffset(), equalTo(got.getStartOffset()));
            assertThat(msg, want.getEndOffset(), equalTo(got.getEndOffset()));
            assertThat(msg, want.getType(), equalTo(got.getType()));
        }
    }
}