org.apache.solr.schema.TestCollationField.java Source code

Introduction

Here is the source code for org.apache.solr.schema.TestCollationField.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.schema;

import java.io.File;
import java.io.FileOutputStream;
import java.text.Collator;
import java.text.RuleBasedCollator;
import java.util.Locale;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;

/**
 * Tests {@link CollationField} with TermQueries, RangeQueries, and sort order.
 */
public class TestCollationField extends SolrTestCaseJ4 {

    /**
     * Creates the temporary Solr home, starts the core with it and indexes the twelve
     * fixture documents that every test in this class queries.
     *
     * @throws Exception if the Solr home cannot be created or the core fails to start
     */
    @BeforeClass
    public static void beforeClass() throws Exception {
        String home = setupSolrHome();
        initCore("solrconfig.xml", "schema.xml", home);
        // Add some docs. Every non-ASCII character is spelled as a unicode escape so the
        // fixtures cannot be silently lost when this source file is re-encoded.
        assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
        // U+0130 = LATIN CAPITAL LETTER I WITH DOT ABOVE
        assertU(adoc("id", "2", "text", "I W\u0130LL USE TURK\u0130SH CASING"));
        // U+0131 = LATIN SMALL LETTER DOTLESS I
        assertU(adoc("id", "3", "text", "\u0131 will use turkish cas\u0131ng"));
        // U+00F6 = LATIN SMALL LETTER O WITH DIAERESIS
        assertU(adoc("id", "4", "text", "T\u00F6ne"));
        // first dotted I is decomposed: I + U+0307 COMBINING DOT ABOVE
        assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURK\u0130SH CASING"));
        // full width form of "Testing"
        assertU(adoc("id", "6", "text", "\uFF34\uFF45\uFF53\uFF54\uFF49\uFF4E\uFF47"));
        assertU(adoc("id", "7", "text", "Tone"));
        assertU(adoc("id", "8", "text", "Testing"));
        assertU(adoc("id", "9", "text", "testing"));
        assertU(adoc("id", "10", "text", "toene"));
        assertU(adoc("id", "11", "text", "Tzne"));
        assertU(adoc("id", "12", "text", "\u0698\u0698"));
        assertU(commit());
    }

    /**
     * Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
     * These are largish files, and jvm-specific (as our documentation says, you should always
     * look out for jvm differences with collation).
     * So it's preferable to create this file on-the-fly.
     *
     * @return absolute path of the freshly created Solr home directory
     * @throws Exception if a configuration file cannot be copied or the rules file cannot be written
     */
    public static String setupSolrHome() throws Exception {
        // make a solr home underneath the test's TEMP_DIR
        File solrHome = createTempDir("collation1").toFile();

        // make data and conf dirs
        new File(solrHome, "data").mkdir();
        File confDir = new File(new File(solrHome, "collection1"), "conf");
        confDir.mkdirs();

        // copy over configuration files
        FileUtils.copyFile(getFile("solr/collection1/conf/solrconfig-basic.xml"),
                new File(confDir, "solrconfig.xml"));
        FileUtils.copyFile(getFile("solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml"),
                new File(confDir, "solrconfig.snippet.randomindexconfig.xml"));
        FileUtils.copyFile(getFile("solr/collection1/conf/schema-collate.xml"),
                new File(confDir, "schema.xml"));

        // generate custom collation rules (DIN 5007-2), saving to customrules.dat
        RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(Locale.GERMANY);

        // Each umlaut (letter + U+0308 combining diaeresis) is tailored to differ only at
        // tertiary level from its two-letter transcription, in lower and upper case.
        String din5007Tailorings = "& ae , a\u0308 & AE , A\u0308"
                + "& oe , o\u0308 & OE , O\u0308"
                + "& ue , u\u0308 & UE , U\u0308";

        RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + din5007Tailorings);
        String tailoredRules = tailoredCollator.getRules();

        // try-with-resources: the stream is closed even when the write fails
        try (FileOutputStream os = new FileOutputStream(new File(confDir, "customrules.dat"))) {
            IOUtils.write(tailoredRules, os, "UTF-8");
        }

        return solrHome.getAbsolutePath();
    }

    /**
     * Test termquery with german DIN 5007-1 primary strength.
     * In this case, o-umlaut (U+00F6) is equivalent to o (but not oe),
     * so "tone" matches documents 4 and 7.
     */
    public void testBasicTermQuery() {
        assertQ("Collated TQ: ",
                req("fl", "id", "q", "sort_de:tone", "sort", "id asc"),
                "//*[@numFound='2']",
                "//result/doc[1]/int[@name='id'][.=4]",
                "//result/doc[2]/int[@name='id'][.=7]");
    }

    /**
     * Test rangequery again with the DIN 5007-1 collator.
     * We do a range query of tone .. tp, in binary order this
     * would retrieve nothing due to case and accent differences.
     */
    public void testBasicRangeQuery() {
        assertQ("Collated RangeQ: ",
                req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc"),
                "//*[@numFound='2']",
                "//result/doc[1]/int[@name='id'][.=4]",
                "//result/doc[2]/int[@name='id'][.=7]");
    }

    /**
     * Test sort with a danish collator. o-umlaut (U+00F6) is ordered after z,
     * so document 11 ("Tzne") sorts before document 4.
     */
    public void testBasicSort() {
        assertQ("Collated Sort: ",
                req("fl", "id", "q", "sort_da:[tz TO t\u00F6z]", "sort", "sort_da asc"),
                "//*[@numFound='2']",
                "//result/doc[1]/int[@name='id'][.=11]",
                "//result/doc[2]/int[@name='id'][.=4]");
    }

    /**
     * Test sort with an arabic collator. U+0633 is ordered after U+0698.
     * With a binary collator, the range would also return nothing.
     */
    public void testArabicSort() {
        assertQ("Collated Sort: ",
                req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort", "sort_ar asc"),
                "//*[@numFound='2']",
                "//result/doc[1]/int[@name='id'][.=12]",
                "//result/doc[2]/int[@name='id'][.=1]");
    }

    /**
     * Test rangequery again with an Arabic collator.
     * Binary order would normally order U+0633 in this range.
     */
    public void testNegativeRangeQuery() {
        assertQ("Collated RangeQ: ",
                req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc"),
                "//*[@numFound='0']");
    }

    /**
     * Test canonical decomposition with turkish primary strength.
     * With this sort order, dotted capital I (U+0130) is the uppercase form of i, and I is
     * the uppercase form of dotless i (U+0131).
     * We index a decomposed form of U+0130 (I followed by U+0307) in document 5.
     */
    public void testCanonicalDecomposition() {
        assertQ("Collated TQ: ",
                req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Cas\u0131ng\"", "sort", "id asc"),
                "//*[@numFound='3']",
                "//result/doc[1]/int[@name='id'][.=2]",
                "//result/doc[2]/int[@name='id'][.=3]",
                "//result/doc[3]/int[@name='id'][.=5]");
    }

    /**
     * Test full decomposition with chinese identical strength.
     * The full width form of "Testing" (U+FF34 U+FF45 U+FF53 U+FF54 U+FF49 U+FF4E U+FF47,
     * document 6) is treated identical to "Testing".
     */
    public void testFullDecomposition() {
        assertQ("Collated TQ: ",
                req("fl", "id", "q", "sort_zh_full:Testing", "sort", "id asc"),
                "//*[@numFound='2']",
                "//result/doc[1]/int[@name='id'][.=6]",
                "//result/doc[2]/int[@name='id'][.=8]");
    }

    /**
     * Test termquery with custom collator (DIN 5007-2).
     * In this case, o-umlaut (U+00F6) is equivalent to oe (but not o),
     * so "toene" matches documents 4 and 10.
     */
    public void testCustomCollation() {
        assertQ("Collated TQ: ",
                req("fl", "id", "q", "sort_custom:toene", "sort", "id asc"),
                "//*[@numFound='2']",
                "//result/doc[1]/int[@name='id'][.=4]",
                "//result/doc[2]/int[@name='id'][.=10]");
    }
}