Example usage for org.apache.lucene.analysis.miscellaneous ASCIIFoldingFilter foldToASCII

List of usage examples for org.apache.lucene.analysis.miscellaneous ASCIIFoldingFilter foldToASCII

Introduction

On this page you can find example usages of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter.foldToASCII.

Prototype

public static final int foldToASCII(char input[], int inputPos, char output[], int outputPos, int length) 

Source Link

Document

Converts characters above ASCII to their ASCII equivalents.

Usage

From source file:com.jaxio.jpa.querybyexample.DefaultLuceneQueryBuilder.java

License:Apache License

/**
 * Apply same filtering as "custom" analyzer. Lowercase is done by QueryParser for fuzzy search.
 *
 * @param word word/*from ww w .j ava 2  s.  com*/
 * @return word escaped
 */
private String escapeForFuzzy(String word) {
    int length = word.length();
    char[] tmp = new char[length * 4];
    length = ASCIIFoldingFilter.foldToASCII(word.toCharArray(), 0, tmp, 0, length);
    return new String(tmp, 0, length);
}

From source file:fr.openwide.core.spring.util.StringUtils.java

License:Apache License

/**
 * Removes the accents from a character string.
 *
 * @param text the string to clean up
 * @return the string without accents, or {@code null} when {@code text} is {@code null}
 * @see org.apache.lucene.analysis.ASCIIFoldingFilter
 */
public static String removeAccents(String text) {
    if (text == null) {
        return text;
    }

    final char[] source = text.toCharArray();
    final int sourceLength = text.length();

    // Worst-case length required for the folded result.
    final int worstCaseLength = 4 * sourceLength;

    // Short inputs use a fixed 256-char buffer; anything that might not fit gets an oversized one.
    final char[] folded;
    if (worstCaseLength > 256) {
        folded = new char[ArrayUtil.oversize(worstCaseLength, RamUsageEstimator.NUM_BYTES_CHAR)];
    } else {
        folded = new char[256];
    }

    final int foldedLength = ASCIIFoldingFilter.foldToASCII(source, 0, folded, 0, sourceLength);
    return new String(folded, 0, foldedLength);
}

From source file:nl.knaw.huygens.analysis.lucene.DiacriticsFilter.java

License:Open Source License

/**
 * Converts the characters above ASCII in a string to their ASCII equivalents.
 *
 * @param s the string to convert; must not be {@code null}
 * @return the ASCII-folded string
 */
public static String convert(String s) {
    char[] input = s.toCharArray();
    // foldToASCII can expand a single input char into up to four output chars, so the
    // output buffer must hold 4 * length; a smaller buffer can overflow on such input.
    char[] output = new char[4 * input.length];
    int pos = ASCIIFoldingFilter.foldToASCII(input, 0, output, 0, input.length);
    return new String(output, 0, pos);
}

From source file:org.apache.jena.query.text.filter.SelectiveFoldingFilter.java

License:Apache License

/**
 * Advances to the next token and folds its non-ASCII characters to ASCII, leaving
 * whitelisted characters untouched.
 *
 * @return {@code true} if a token was read and processed, {@code false} when the
 *         wrapped input has no more tokens
 * @throws IOException if the wrapped input fails to advance
 */
@Override
public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
        final char[] buffer = termAtt.buffer();
        final int length = termAtt.length();
        // prepare the output char array, adapted from ASCIIFoldingFilter
        final int maxSizeNeeded = 4 * length;
        char[] output = new char[ArrayUtil.oversize(maxSizeNeeded, Character.BYTES)];
        // The write position is tracked separately from the read position: a single
        // input char may fold to several output chars, which must not be overwritten
        // by the chars that follow.
        int outputPos = 0;
        for (int i = 0; i < length; ++i) {
            final char c = buffer[i];
            if (c >= '\u0080' && !whitelisted.contains(c)) {
                // fold exactly one char; foldToASCII returns the output position
                // just past the last char it wrote
                outputPos = ASCIIFoldingFilter.foldToASCII(buffer, i, output, outputPos, 1);
            } else {
                output[outputPos++] = c;
            }
        }
        // the folded term may be longer than the original one
        termAtt.copyBuffer(output, 0, outputPos);
        return true;
    }
    return false;
}

From source file:org.apache.unomi.persistence.elasticsearch.conditions.ConditionContextHelper.java

License:Apache License

/**
 * Lowercases a string and converts its characters above ASCII to their ASCII equivalents.
 *
 * @param s the string to fold, may be {@code null}
 * @return the lowercased, ASCII-folded string, or {@code null} when {@code s} is {@code null}
 */
public static String foldToASCII(String s) {
    if (s == null) {
        return null;
    }
    // Locale.ROOT keeps the result independent of the JVM's default locale
    // (fully qualified so the file's imports do not need to change).
    final String lowered = s.toLowerCase(java.util.Locale.ROOT);
    // Worst case: every char folds to four output chars.
    final int maxSizeNeeded = 4 * lowered.length();
    // 2 = bytes per char element, as expected by ArrayUtil.oversize
    final char[] output = new char[ArrayUtil.oversize(maxSizeNeeded, 2)];
    final int length = ASCIIFoldingFilter.foldToASCII(lowered.toCharArray(), 0, output, 0, lowered.length());
    return new String(output, 0, length);
}

From source file:org.t3as.metamap.MetaMap.java

License:Open Source License

/**
 * Takes a Unicode string and tries to reduce characters outside 7-bit ASCII (Unicode Basic Latin)
 * to 7-bit ASCII. For example, accented variants of 'a' and 'o' are turned into plain 'a' and 'o'.
 * Note that this doesn't always succeed for some of the much more complicated characters, and
 * occasionally a complicated character ends up as two characters when the ASCIIFoldingFilter is used.
 * Perhaps we want to adopt this library instead:
 * http://www.ippatsuman.com/projects/junidecode/
 *
 * @param s the string to convert; must not be {@code null}
 * @return the folded string, keeping only printable US-ASCII characters and newlines
 */
public static String decomposeToAscii(final String s) {
    // A pure-JDK alternative (Normalizer with Form.NFD, then stripping
    // \p{InCombiningDiacriticalMarks}) doesn't work all the time; the Lucene
    // folding below works on more cases.
    final char[] source = s.toCharArray();
    final char[] folded = new char[source.length * 4];
    final int foldedLength = ASCIIFoldingFilter.foldToASCII(source, 0, folded, 0, source.length);

    // Drop whatever is still outside the printable US-ASCII range (32 to 126), but keep newlines.
    final StringBuilder printable = new StringBuilder(foldedLength);
    for (int idx = 0; idx < foldedLength; idx++) {
        final char ch = folded[idx];
        final boolean isPrintableAscii = ch >= 32 && ch <= 126;
        if (isPrintableAscii || ch == '\n') {
            printable.append(ch);
        }
    }
    return printable.toString();
}