List of usage examples for the method foldToASCII of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
public static final int foldToASCII(char input[], int inputPos, char output[], int outputPos, int length)
From source file:com.jaxio.jpa.querybyexample.DefaultLuceneQueryBuilder.java
License:Apache License
/** * Apply same filtering as "custom" analyzer. Lowercase is done by QueryParser for fuzzy search. * * @param word word/*from ww w .j ava 2 s. com*/ * @return word escaped */ private String escapeForFuzzy(String word) { int length = word.length(); char[] tmp = new char[length * 4]; length = ASCIIFoldingFilter.foldToASCII(word.toCharArray(), 0, tmp, 0, length); return new String(tmp, 0, length); }
From source file:fr.openwide.core.spring.util.StringUtils.java
License:Apache License
/** * Supprime les accents d'une chane de caractres. * //ww w. j av a2s. co m * @param text chane nettoyer * @return chane sans accent * @see org.apache.lucene.analysis.ASCIIFoldingFilter */ public static String removeAccents(String text) { if (text == null) { return text; } int length = text.length(); char[] input = text.toCharArray(); char[] output = new char[256]; // Worst-case length required: final int maxSizeNeeded = 4 * length; if (output.length < maxSizeNeeded) { output = new char[ArrayUtil.oversize(maxSizeNeeded, RamUsageEstimator.NUM_BYTES_CHAR)]; } int outputPos = ASCIIFoldingFilter.foldToASCII(input, 0, output, 0, length); return new String(output, 0, outputPos); }
From source file:nl.knaw.huygens.analysis.lucene.DiacriticsFilter.java
License:Open Source License
/**
 * Converts a string to its ASCII-folded form using
 * {@code ASCIIFoldingFilter.foldToASCII}.
 *
 * @param s the string to fold; must not be {@code null}
 * @return the folded string
 */
public static String convert(String s) {
    char[] input = s.toCharArray();
    // foldToASCII may emit up to four chars for a single input char, so the
    // output buffer must be sized for that worst case; a 2x buffer overflows
    // on inputs made mostly of characters with 3- or 4-char foldings.
    char[] output = new char[4 * input.length];
    int pos = ASCIIFoldingFilter.foldToASCII(input, 0, output, 0, input.length);
    return new String(output, 0, pos);
}
From source file:org.apache.jena.query.text.filter.SelectiveFoldingFilter.java
License:Apache License
@Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { final char[] buffer = termAtt.buffer(); final int length = termAtt.length(); // prepare the output char array, adapted from ASCIIFoldingFilter final int maxSizeNeeded = 4 * length; char[] output = new char[ArrayUtil.oversize(maxSizeNeeded, Character.BYTES)]; for (int i = 0; i < length; ++i) { final char c = buffer[i]; if (c >= '\u0080' && !whitelisted.contains(c)) { // here we are using the method that will iterate always over a list with a // single char ASCIIFoldingFilter.foldToASCII(buffer, i, output, i, 1); } else { output[i] = c;// w w w . j av a2 s . c o m } } termAtt.copyBuffer(output, 0, length); return true; } return false; }
From source file:org.apache.unomi.persistence.elasticsearch.conditions.ConditionContextHelper.java
License:Apache License
/**
 * Lower-cases a string and folds its non-ASCII characters to their ASCII equivalents.
 *
 * @param s the string to fold, may be {@code null}
 * @return the lower-cased, folded string, or {@code null} when {@code s} is {@code null}
 */
public static String foldToASCII(String s) {
    if (s == null) {
        return null;
    }
    // Locale.ROOT keeps the result independent of the JVM default locale
    // (for instance, a Turkish locale lower-cases 'I' to a dotless i).
    final char[] input = s.toLowerCase(java.util.Locale.ROOT).toCharArray();
    // Worst case: every input char folds to four output chars.
    final int maxSizeNeeded = 4 * input.length;
    // 2 = bytes per char, used by oversize to compute the allocation
    final char[] output = new char[ArrayUtil.oversize(maxSizeNeeded, 2)];
    final int length = ASCIIFoldingFilter.foldToASCII(input, 0, output, 0, input.length);
    return new String(output, 0, length);
}
From source file:org.t3as.metamap.MetaMap.java
License:Open Source License
/** * Takes a Unicode string and tries to decompose non-7bit-ascii (Unicode Basic Latin) characters into 7bit ascii. * For example, the string '' is turned into 'aaaooo'. * Note that it doesn't always succeed for some of the much more complicated characters (e.g. ''). * Occasionally some complicated characters end up as two characters when the ASCIIFoldingFilter is used... * Perhaps we want to adopt this library: * http://www.ippatsuman.com/projects/junidecode/ *//*from w w w. j av a 2 s.c o m*/ public static String decomposeToAscii(final String s) { /* pure java version, doesn't work all the time: String normalized = Normalizer.normalize(s, Normalizer.Form.NFD); return normalized.replaceAll("\\p{InCombiningDiacriticalMarks}+", ""); */ // this works on more cases final char[] input = new char[s.length()]; s.getChars(0, s.length(), input, 0); final char[] output = new char[input.length * 4]; final int numChars = ASCIIFoldingFilter.foldToASCII(input, 0, output, 0, input.length); // now remove anything not in the printable US-ASCII range, but keep newlines final StringBuilder sb = new StringBuilder(numChars); for (int i = 0; i < numChars; i++) { final char c = output[i]; // printable US-ASCII is from 32 to 126 if ((32 <= c && c <= 126) || '\n' == c) sb.append(c); } return sb.toString(); }