Example usage for java.lang Character getType

List of usage examples for java.lang Character getType

Introduction

On this page you can find example usages of java.lang.Character.getType.

Prototype

public static int getType(int codePoint) 

Source Link

Document

Returns a value indicating a character's general category.

Usage

From source file:com.vuze.android.remote.adapter.TorrentListAdapter.java

/**
 * Tells whether the given code point is a cased letter, i.e. its Unicode
 * general category is "Letter, Uppercase" or "Letter, Lowercase".
 *
 * @param c the code point to classify
 * @return true for upper- or lowercase letters, false for everything else
 *         (including titlecase, modifier and "other" letters)
 */
private static boolean isAlphabetic(int c) {
    // Character.isAlphabetic(int) is deliberately not used: per the original
    // author's note it also seems to accept symbolic languages.
    // A plain 'A'..'Z' / 'a'..'z' range test would be simpler, but it would
    // miss accented letters.
    if (Character.isLetter(c)) {
        switch (Character.getType(c)) {
        case Character.UPPERCASE_LETTER:
        case Character.LOWERCASE_LETTER:
            return true;
        default:
            return false;
        }
    }
    return false;
}

From source file:com.vuze.android.remote.adapter.TorrentListAdapter.java

/**
 * Tells whether the given code point belongs to one of the Unicode general
 * categories "Punctuation, Open", "Punctuation, Close" or "Punctuation, Other".
 * Dash, connector and quote punctuation are not matched.
 *
 * @param c the code point to classify
 * @return true if the code point is start, end or other punctuation
 */
private static boolean isStandardPuncuation(int c) {
    switch (Character.getType(c)) {
    case Character.START_PUNCTUATION:
    case Character.END_PUNCTUATION:
    case Character.OTHER_PUNCTUATION:
        return true;
    default:
        return false;
    }
}

From source file:org.opendatakit.services.preferences.fragments.ServerSettingsFragment.java

/**
 * Builds an input filter that refuses user entry containing control
 * characters (which covers carriage returns and line feeds).
 *
 * @return a filter whose {@code filter} method yields an empty string as soon
 *         as any char in {@code source[start, end)} has the Unicode category
 *         CONTROL, and {@code null} otherwise
 */
private InputFilter getReturnFilter() {
    return new InputFilter() {
        public CharSequence filter(CharSequence source, int start, int end, Spanned dest, int dstart,
                int dend) {
            int pos = start;
            while (pos < end) {
                char candidate = source.charAt(pos);
                if (Character.CONTROL == Character.getType(candidate)) {
                    // replace the whole proposed change with nothing
                    return "";
                }
                pos++;
            }
            // NOTE(review): presumably null tells the framework to keep the
            // proposed text as-is - confirm against the InputFilter contract
            return null;
        }
    };
}

From source file:org.apache.pdfbox.text.TextPosition.java

/**
 * Checks whether the text of this position is a single diacritic char.
 *
 * @return True if {@code getUnicode()} is exactly one char long and that char's
 *         Unicode category is non-spacing mark, modifier symbol or modifier
 *         letter; false in every other case.
 */
public boolean isDiacritic() {
    String unicode = this.getUnicode();
    if (unicode.length() == 1) {
        switch (Character.getType(unicode.charAt(0))) {
        case Character.NON_SPACING_MARK:
        case Character.MODIFIER_SYMBOL:
        case Character.MODIFIER_LETTER:
            return true;
        default:
            return false;
        }
    }
    // empty or multi-char text is never treated as a diacritic
    return false;
}

From source file:tufts.vue.ds.Field.java

/**
 * Tells whether the given code point is a currency sign.
 *
 * @param c the code point to classify
 * @return true for '$' or any code point in the Unicode category
 *         "Symbol, Currency"
 */
private static boolean isCurrencySymbol(int c) {
    // explicit shortcut; it should be redundant with the category test below
    if (c == '$') {
        return true;
    }
    return Character.CURRENCY_SYMBOL == Character.getType(c);
}

From source file:gate.creole.tokeniser.SimpleTokeniser.java

/**
 * The method that does the actual tokenisation.
 * <p>
 * The document content is scanned one code point at a time. For each code
 * point its Unicode general category ({@link Character#getType(int)}) is
 * translated through {@code typeIds} and fed to the FSM, starting from
 * {@code dInitialState}. When the FSM has no transition for the current code
 * point, an annotation is added to the target annotation set: either the
 * token described by the last final state that was reached, or a
 * {@code DEFAULT_TOKEN} of type {@code UNKNOWN} when no final state was
 * reached. Scanning then restarts from the initial state.
 *
 * @throws ExecutionException if no document has been set; an
 *         {@code ExecutionInterruptedException} is thrown when
 *         {@code isInterrupted()} reports true at one of the periodic
 *         progress checks
 */
@Override
public void execute() throws ExecutionException {
    interrupted = false;
    AnnotationSet annotationSet;
    //check the input
    if (document == null) {
        throw new ExecutionException("No document to tokenise!");
    }

    // a null or empty set name selects the annotation set returned by the no-arg getter
    if (annotationSetName == null || annotationSetName.equals("")) {
        annotationSet = document.getAnnotations();
    } else {
        annotationSet = document.getAnnotations(annotationSetName);
    }

    fireStatusChanged("Tokenising " + document.getName() + "...");

    String content = document.getContent().toString();
    int length = content.length();
    int currentChar;
    int charsInCurrentCP = 1;

    DFSMState graphPosition = dInitialState;

    //the index of the first character of the token trying to be recognised
    int tokenStart = 0;

    DFSMState lastMatchingState = null;
    DFSMState nextState;
    String tokenString;
    int charIdx = 0;
    int oldCharIdx = 0;
    FeatureMap newTokenFm;

    while (charIdx < length) {
        currentChar = content.codePointAt(charIdx);
        // number of chars we have to advance after processing this code point.
        // 1 in the vast majority of cases, but 2 where the code point is a
        // supplementary character represented as a surrogate pair.
        charsInCurrentCP = Character.isSupplementaryCodePoint(currentChar) ? 2 : 1;

        // Integer.valueOf replaces the deprecated boxing constructor
        nextState = graphPosition.next(typeIds.get(Integer.valueOf(Character.getType(currentChar))).intValue());

        if (null != nextState) {
            graphPosition = nextState;
            if (graphPosition.isFinal()) {
                lastMatchingState = graphPosition;
            }
            charIdx += charsInCurrentCP;
        } else {//we have a match!
            newTokenFm = Factory.newFeatureMap();

            if (null == lastMatchingState) {
                // no rule matches this character, so create a single-char
                // DEFAULT_TOKEN annotation covering it and start again after it
                charIdx = tokenStart + charsInCurrentCP;
                tokenString = content.substring(tokenStart, charIdx);
                newTokenFm.put("type", "UNKNOWN");
                newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
                newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length()));

                try {
                    annotationSet.add(Long.valueOf(tokenStart), Long.valueOf(charIdx), "DEFAULT_TOKEN",
                            newTokenFm);
                } catch (InvalidOffsetException ioe) {
                    //This REALLY shouldn't happen!
                    // NOTE(review): only logged here, while the other branches
                    // rethrow as GateRuntimeException - kept as in the original
                    ioe.printStackTrace(Err.getPrintWriter());
                }
            } else {
                // we've reached the end of a string that the FSM recognised
                tokenString = content.substring(tokenStart, charIdx);
                newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
                newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length()));

                // entry 0 of the token description carries the annotation type;
                // the remaining entries are feature name/value pairs
                for (int i = 1; i < lastMatchingState.getTokenDesc().length; i++) {
                    newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
                            lastMatchingState.getTokenDesc()[i][1]);
                }

                try {
                    annotationSet.add(Long.valueOf(tokenStart), Long.valueOf(charIdx),
                            lastMatchingState.getTokenDesc()[0][0], newTokenFm);
                } catch (InvalidOffsetException ioe) {
                    //This REALLY shouldn't happen!
                    throw new GateRuntimeException(ioe.toString());
                }
            }

            // reset to initial state and start looking again from here
            lastMatchingState = null;
            graphPosition = dInitialState;
            tokenStart = charIdx;
        }

        // report progress and poll for interruption roughly every 256 chars
        if ((charIdx - oldCharIdx > 256)) {
            fireProgressChanged((100 * charIdx) / length);
            oldCharIdx = charIdx;
            if (isInterrupted()) {
                throw new ExecutionInterruptedException();
            }
        }

    } // while(charIdx < length)

    if (null != lastMatchingState) {
        // we dropped off the end having found a match, annotate it
        tokenString = content.substring(tokenStart, charIdx);
        newTokenFm = Factory.newFeatureMap();
        newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
        newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length()));

        for (int i = 1; i < lastMatchingState.getTokenDesc().length; i++) {
            newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], lastMatchingState.getTokenDesc()[i][1]);
        }

        try {
            annotationSet.add(Long.valueOf(tokenStart), Long.valueOf(charIdx),
                    lastMatchingState.getTokenDesc()[0][0], newTokenFm);
        } catch (InvalidOffsetException ioe) {
            //This REALLY shouldn't happen!
            throw new GateRuntimeException(ioe.toString());
        }

    }

    reset();
    fireProcessFinished();
    fireStatusChanged("Tokenisation complete!");
}

From source file:marytts.util.string.StringUtils.java

/**
 * Decide whether a code point counts as a letter or a modifier under the
 * Unicode standard. The answer is true exactly when the code point's general
 * category, as defined at
 * http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values,
 * is one of:
 * <ul>
 * <li>Lu  Letter, Uppercase</li>
 * <li>Ll  Letter, Lowercase</li>
 * <li>Lt  Letter, Titlecase</li>
 * <li>Lm  Letter, Modifier</li>
 * <li>Lo  Letter, Other</li>
 * <li>Mn  Mark, Nonspacing</li>
 * <li>Mc  Mark, Spacing Combining</li>
 * <li>Me  Mark, Enclosing</li>
 * </ul>
 * The category assigned to a particular character is listed in
 * http://unicode.org/Public/UNIDATA/UnicodeData.txt
 * @param codePoint the unicode codepoint as determined e.g. by String.codePointAt().
 * @return true if the codepoint is in one of the categories above, false otherwise
 */
public static boolean isLetterOrModifier(int codePoint) {
    switch (Character.getType(codePoint)) {
    // letters
    case Character.UPPERCASE_LETTER:
    case Character.LOWERCASE_LETTER:
    case Character.TITLECASE_LETTER:
    case Character.MODIFIER_LETTER:
    case Character.OTHER_LETTER:
    // marks
    case Character.NON_SPACING_MARK:
    case Character.COMBINING_SPACING_MARK:
    case Character.ENCLOSING_MARK:
        return true;
    default:
        return false;
    }
}

From source file:org.opensextant.util.TextUtils.java

/**
 * Supports Phoneticizer utility from OpenSextant v1.x Remove diacritics
 * from a phrase/*from   w  ww.  j  av a  2 s.com*/
 * 
 * @param word
 *            text
 * @return scrubbed text
 */
public static String removeDiacritics(String word) {

    // first, fully decomposed all chars
    String tmpWord = Normalizer.normalize(word, Normalizer.Form.NFD);
    StringBuilder newWord = new StringBuilder();
    char[] chars = tmpWord.toCharArray();
    // now, discard any characters from one of the "Mark" categories.
    for (char c : chars) {
        if (Character.getType(c) != Character.NON_SPACING_MARK
                && Character.getType(c) != Character.COMBINING_SPACING_MARK
                && Character.getType(c) != Character.ENCLOSING_MARK) {
            newWord.append(c);
        }
    }
    return newWord.toString();
}

From source file:org.apache.orc.impl.mask.RedactMaskFactory.java

/**
 * Given a UTF code point, find the replacement codepoint
 * @param codepoint a UTF character
 * @return the replacement codepoint
 */
int getReplacement(int codepoint) {
    // Dispatch on the Unicode general category: every family of categories
    // shares one replacement, anything unlisted gets OTHER_REPLACEMENT.
    final int category = Character.getType(codepoint);
    switch (category) {
    // --- letters ---
    case Character.LOWERCASE_LETTER:
        return LOWER_REPLACEMENT;
    case Character.UPPERCASE_LETTER:
        return UPPPER_REPLACEMENT;
    case Character.OTHER_LETTER:
    case Character.MODIFIER_LETTER:
    case Character.TITLECASE_LETTER:
        return OTHER_LETTER_REPLACEMENT;

    // --- numbers ---
    case Character.DECIMAL_DIGIT_NUMBER:
        return DIGIT_CP_REPLACEMENT;
    case Character.OTHER_NUMBER:
    case Character.LETTER_NUMBER:
        return OTHER_NUMBER_REPLACEMENT;

    // --- marks ---
    case Character.COMBINING_SPACING_MARK:
    case Character.ENCLOSING_MARK:
    case Character.NON_SPACING_MARK:
        return MARK_REPLACEMENT;

    // --- separators ---
    case Character.PARAGRAPH_SEPARATOR:
    case Character.LINE_SEPARATOR:
    case Character.SPACE_SEPARATOR:
        return SEPARATOR_REPLACEMENT;

    // --- symbols ---
    case Character.OTHER_SYMBOL:
    case Character.MODIFIER_SYMBOL:
    case Character.CURRENCY_SYMBOL:
    case Character.MATH_SYMBOL:
        return SYMBOL_REPLACEMENT;

    // --- punctuation ---
    case Character.OTHER_PUNCTUATION:
    case Character.CONNECTOR_PUNCTUATION:
    case Character.END_PUNCTUATION:
    case Character.START_PUNCTUATION:
    case Character.DASH_PUNCTUATION:
        return PUNCTUATION_REPLACEMENT;

    default:
        return OTHER_REPLACEMENT;
    }
}

From source file:org.apache.orc.impl.mask.RedactMaskFactory.java

/**
 * Mask the given stringified numeric value excluding the unmask range.
 * Non-digit characters are passed through on the assumption they are
 * markers (eg. one of ",.ef")./*from   w w  w.ja va 2 s. com*/
 * @param value the original value.
 */
String maskNumericString(final String value) {
    StringBuilder result = new StringBuilder();
    final int length = value.codePointCount(0, value.length());
    for (int c = 0; c < length; ++c) {
        int cp = value.codePointAt(c);
        if (isIndexInUnmaskRange(c, length) || Character.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
            result.appendCodePoint(cp);
        } else {
            result.appendCodePoint(DIGIT_CP_REPLACEMENT);
        }
    }
    return result.toString();
}