Example usage for java.text.BreakIterator.next()

List of usage examples for java.text.BreakIterator.next()

Introduction

On this page you can find example usages of java.text.BreakIterator.next().

Prototype

public abstract int next();

Source Link

Document

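Returns the boundary following the current boundary. If the current boundary is the last text boundary, it returns BreakIterator.DONE and the iterator's current position is unchanged.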

Usage

From source file:Main.java

/**
 * Builds the frame: places the output area inside a scroll pane and fills it
 * with a sample text in which a "|" is written after every piece delimited by
 * the default locale's character break iterator.
 */
public TextBoundaryFrame() {
    getContentPane().add(new JScrollPane(outputText));

    final String sample = "The quick, brown fox jump-ed\n" + "over the lazy \"dog.\" And then...what happened?";

    final BreakIterator boundaries = BreakIterator.getCharacterInstance(Locale.getDefault());
    boundaries.setText(sample);
    outputText.setText("");

    // Walk consecutive boundary pairs, emitting each piece followed by a bar.
    int pieceStart = boundaries.first();
    for (int pieceEnd = boundaries.next(); pieceEnd != BreakIterator.DONE; pieceEnd = boundaries.next()) {
        outputText.append(sample.substring(pieceStart, pieceEnd) + "|");
        pieceStart = pieceEnd;
    }

    // Emit whatever follows the last boundary that was consumed.
    outputText.append(sample.substring(pieceStart));
}

From source file:be.idamediafoundry.sofa.livecycle.dsc.util.AbstractQDoxComponentInfoExtractor.java

/**
 * Extracts the first sentence of a text using the default locale's sentence
 * {@link BreakIterator}.
 *
 * @param text the text to inspect; may be {@code null}
 * @return the trimmed first sentence, or {@code text} unchanged when it is
 *         {@code null} or the iterator reports no boundary after the start
 */
final protected String getFirstSentence(String text) {
    if (text == null) {
        return text;
    }

    BreakIterator sentences = BreakIterator.getSentenceInstance();
    sentences.setText(text);

    int begin = sentences.first();
    int finish = sentences.next();
    if (finish == BreakIterator.DONE) {
        // No boundary after the start (e.g. empty text): hand the input back as is.
        return text;
    }
    return text.substring(begin, finish).trim();
}

From source file:com.norconex.importer.handler.tagger.impl.TextStatisticsTagger.java

/**
 * Gathers text statistics from {@code input} and records them on
 * {@code metadata} under keys starting with {@code "document.stat."}.
 * <p>
 * The input is read line by line. Each line is trimmed, blank lines are
 * skipped, and every remaining line counts as one paragraph. Words are the
 * matches of {@code PATTERN_WORD}; sentences are delimited by the default
 * locale's sentence {@link BreakIterator}.
 *
 * @param reference the document reference (not used by this method)
 * @param input     the document text
 * @param metadata  receives the counts (as longs) and the averages (as the
 *                  strings produced by {@code divide})
 * @param parsed    whether the document was parsed (not used by this method)
 * @throws ImporterHandlerException declared by the overridden method
 */
@Override
protected void tagTextDocument(String reference, Reader input, ImporterMetadata metadata, boolean parsed)
        throws ImporterHandlerException {
    long charCount = 0;
    long wordCharCount = 0;
    long wordCount = 0;
    long sentenceCount = 0;
    long sentenceCharCount = 0;
    long paragraphCount = 0;

    // One iterator serves every line: setText() rebinds it and resets its
    // position, so there is no need to build a new instance per line.
    final BreakIterator boundary = BreakIterator.getSentenceInstance();

    //TODO make this more efficient, by doing all this in one pass.
    LineIterator it = IOUtils.lineIterator(input);
    while (it.hasNext()) {
        String line = it.nextLine().trim();
        if (StringUtils.isBlank(line)) {
            continue;
        }

        // Paragraph
        paragraphCount++;

        // Character
        charCount += line.length();

        // Word
        Matcher matcher = PATTERN_WORD.matcher(line);
        while (matcher.find()) {
            int wordLength = matcher.end() - matcher.start();
            wordCount++;
            wordCharCount += wordLength;
        }

        // Sentence
        boundary.setText(line);
        int start = boundary.first();
        for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
            sentenceCharCount += (end - start);
            sentenceCount++;
        }
    }

    // Optional extra key segment, inserted between "document.stat." and the statistic name.
    String field = StringUtils.EMPTY;
    if (StringUtils.isNotBlank(fieldName)) {
        field = fieldName.trim() + ".";
    }

    //--- Add fields ---
    metadata.addLong("document.stat." + field + "characterCount", charCount);
    metadata.addLong("document.stat." + field + "wordCount", wordCount);
    metadata.addLong("document.stat." + field + "sentenceCount", sentenceCount);
    metadata.addLong("document.stat." + field + "paragraphCount", paragraphCount);
    metadata.addString("document.stat." + field + "averageWordCharacterCount",
            divide(wordCharCount, wordCount));
    metadata.addString("document.stat." + field + "averageSentenceCharacterCount",
            divide(sentenceCharCount, sentenceCount));
    metadata.addString("document.stat." + field + "averageSentenceWordCount", divide(wordCount, sentenceCount));
    metadata.addString("document.stat." + field + "averageParagraphCharacterCount",
            divide(charCount, paragraphCount));
    metadata.addString("document.stat." + field + "averageParagraphSentenceCount",
            divide(sentenceCount, paragraphCount));
    metadata.addString("document.stat." + field + "averageParagraphWordCount",
            divide(wordCount, paragraphCount));

}

From source file:com.glaf.core.util.StringTools.java

/**
 * Splits a text into words with the default locale's word
 * {@link BreakIterator} and returns them in lower case.
 * <p>
 * Each piece between two boundaries is trimmed and passed through
 * {@code replace} to drop the characters {@code + / \ # * ) ( &}; pieces that
 * end up empty are discarded. The remaining pieces are lower-cased with
 * {@link java.util.Locale#ROOT} so the result does not depend on the
 * platform locale.
 *
 * @param text the text to split; may be {@code null}
 * @return the lower-cased words in order of appearance; an empty array when
 *         {@code text} is {@code null} or empty
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.length() == 0) {
        return new String[0];
    }

    List<String> wordList = new java.util.ArrayList<String>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);
    int start = boundary.first();

    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String tmp = text.substring(start, end).trim();
        // Remove characters that are not needed.
        tmp = replace(tmp, "+", "");
        tmp = replace(tmp, "/", "");
        tmp = replace(tmp, "\\", "");
        tmp = replace(tmp, "#", "");
        tmp = replace(tmp, "*", "");
        tmp = replace(tmp, ")", "");
        tmp = replace(tmp, "(", "");
        tmp = replace(tmp, "&", "");
        if (tmp.length() > 0) {
            // The method name promises lower-case words; the case was previously left untouched.
            wordList.add(tmp.toLowerCase(java.util.Locale.ROOT));
        }
    }
    return wordList.toArray(new String[wordList.size()]);
}

From source file:IteratorTest.java

/**
 * Re-splits the text area's content with the break iterator that matches the
 * selected mode button (character, word, line or sentence) and the selected
 * locale, then shows the resulting pieces in the item list.
 * <p>
 * When none of the mode buttons is selected the list is simply emptied.
 */
protected void refreshDisplay() {
    Vector<String> items = new Vector<String>();
    String msgText = textArea.getText();
    Locale locale = (Locale) (localeButton.getSelectedItem());

    BreakIterator iterator = null;
    if (charButton.isSelected()) {
        iterator = BreakIterator.getCharacterInstance(locale);
    } else if (wordButton.isSelected()) {
        iterator = BreakIterator.getWordInstance(locale);
    } else if (lineButton.isSelected()) {
        iterator = BreakIterator.getLineInstance(locale);
    } else if (sentButton.isSelected()) {
        iterator = BreakIterator.getSentenceInstance(locale);
    }

    if (iterator == null) {
        // No mode selected: show an empty list rather than fail with a NullPointerException.
        itemList.setListData(items);
        return;
    }

    iterator.setText(msgText);
    int startIndex = iterator.first();
    for (int nextIndex = iterator.next(); nextIndex != BreakIterator.DONE; nextIndex = iterator.next()) {
        items.addElement(msgText.substring(startIndex, nextIndex));
        startIndex = nextIndex;
    }
    itemList.setListData(items);
}

From source file:com.redhat.rcm.version.Cli.java

/**
 * Wraps {@code line} at the line-break opportunities reported by the default
 * locale's line {@link BreakIterator} and prints each resulting row through
 * the format {@code "%s%-" + max + "s\n"}, i.e. the indent followed by the
 * row left-justified in a field of {@code max} characters.
 * <p>
 * A single segment longer than {@code max} is not split; it is printed on a
 * row of its own.
 *
 * @param line   the text to wrap
 * @param indent the prefix printed in front of every row
 * @param max    the target row width in characters
 * @param pw     the writer that receives the rows
 */
private static void printTextLine(final String line, final String indent, final int max, final PrintWriter pw) {
    final String fmt = "%s%-" + max + "s\n";

    final List<String> lines = new ArrayList<String>();

    final BreakIterator iter = BreakIterator.getLineInstance();
    iter.setText(line);

    final StringBuilder currentLine = new StringBuilder();
    int start = iter.first();
    for (int end = iter.next(); end != BreakIterator.DONE; start = end, end = iter.next()) {
        final String seg = line.substring(start, end);
        // Flush only a non-empty buffer: an oversized segment arriving while the
        // buffer is empty must not produce a blank row.
        if (currentLine.length() > 0 && currentLine.length() + seg.length() > max) {
            lines.add(currentLine.toString());
            currentLine.setLength(0);
        }

        currentLine.append(seg);
    }

    if (currentLine.length() > 0) {
        lines.add(currentLine.toString());
    }

    for (final String ln : lines) {
        pw.printf(fmt, indent, ln);
    }
}

From source file:com.redhat.rcm.version.Cli.java

/**
 * Prints a key/value pair, wrapping the value at the line-break
 * opportunities reported by the default locale's line {@link BreakIterator}.
 * <p>
 * The first row is printed as {@code fmt(key, firstValueRow)}; every further
 * row is printed as {@code fmt("", row)} so that it lines up under the value
 * column. An empty value still prints one row with an empty value field. A
 * single segment longer than {@code valMax} is not split; it is printed on a
 * row of its own.
 *
 * @param key    the key shown on the first row
 * @param value  the value text to wrap
 * @param fmt    a format string with two fields (key, value row)
 * @param valMax the target width of a value row in characters
 * @param pw     the writer that receives the rows
 */
private static void printKVLine(final String key, final String value, final String fmt, final int valMax,
        final PrintWriter pw) {
    final List<String> lines = new ArrayList<String>();

    final BreakIterator iter = BreakIterator.getLineInstance();
    iter.setText(value);

    final StringBuilder currentLine = new StringBuilder();
    int start = iter.first();
    for (int end = iter.next(); end != BreakIterator.DONE; start = end, end = iter.next()) {
        final String seg = value.substring(start, end);
        // Flush only a non-empty buffer: otherwise an oversized first segment
        // would leave the key sitting next to an empty value row.
        if (currentLine.length() > 0 && currentLine.length() + seg.length() > valMax) {
            lines.add(currentLine.toString());
            currentLine.setLength(0);
        }

        currentLine.append(seg);
    }

    if (currentLine.length() > 0) {
        lines.add(currentLine.toString());
    }

    pw.printf(fmt, key, lines.isEmpty() ? "" : lines.get(0));
    for (int i = 1; i < lines.size(); i++) {
        // blank string to serve for indentation in format with two fields.
        pw.printf(fmt, "", lines.get(i));
    }
}

From source file:Utils.java

/**
 * Wrap multi-line strings (and get the individual lines).
 * /*from  w w  w. j av  a 2 s . c  o  m*/
 * @param original
 *          the original string to wrap
 * @param width
 *          the maximum width of lines
 * @param breakIterator
 *          breaks original to chars, words, sentences, depending on what
 *          instance you provide.
 * @param removeNewLines
 *          if <code>true</code>, any newlines in the original string are
 *          ignored
 * @return the lines after wrapping
 */
public static String[] wrapStringToArray(String original, int width, BreakIterator breakIterator,
        boolean removeNewLines) {
    if (original.length() == 0) {
        return new String[] { original };
    }

    String[] workingSet;

    // substitute original newlines with spaces,
    // remove newlines from head and tail
    if (removeNewLines) {
        original = trimString(original);
        original = original.replace('\n', ' ');
        workingSet = new String[] { original };
    } else {
        StringTokenizer tokens = new StringTokenizer(original, "\n"); // NOI18N
        int len = tokens.countTokens();
        workingSet = new String[len];

        for (int i = 0; i < len; i++) {
            workingSet[i] = tokens.nextToken();
        }
    }

    if (width < 1) {
        width = 1;
    }

    if (original.length() <= width) {
        return workingSet;
    }

    widthcheck: {
        boolean ok = true;

        for (int i = 0; i < workingSet.length; i++) {
            ok = ok && (workingSet[i].length() < width);

            if (!ok) {
                break widthcheck;
            }
        }

        return workingSet;
    }

    java.util.ArrayList<String> lines = new java.util.ArrayList<String>();

    int lineStart = 0; // the position of start of currently processed line in
                       // the original string

    for (int i = 0; i < workingSet.length; i++) {
        if (workingSet[i].length() < width) {
            lines.add(workingSet[i]);
        } else {
            breakIterator.setText(workingSet[i]);

            int nextStart = breakIterator.next();
            int prevStart = 0;

            do {
                while (((nextStart - lineStart) < width) && (nextStart != BreakIterator.DONE)) {
                    prevStart = nextStart;
                    nextStart = breakIterator.next();
                }

                if (nextStart == BreakIterator.DONE) {
                    nextStart = prevStart = workingSet[i].length();
                }

                if (prevStart == 0) {
                    prevStart = nextStart;
                }

                lines.add(workingSet[i].substring(lineStart, prevStart));

                lineStart = prevStart;
                prevStart = 0;
            } while (lineStart < workingSet[i].length());

            lineStart = 0;
        }
    }

    String[] s = new String[lines.size()];

    return (String[]) lines.toArray(s);
}

From source file:com.tao.realweb.util.StringUtil.java

/**
 * Converts a line of text into an array of lower case words using a
 * BreakIterator.wordInstance().<p>
 *
 * This method is under the Jive Open Source Software License and was
 * written by Mark Imbriaco./*  www.  j  av a  2s  .c  o m*/
 *
 * @param text a String of text to convert into an array of words
 * @return text broken up into an array of words.
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.length() == 0) {
        return new String[0];
    }

    List<String> wordList = new ArrayList<String>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);
    int start = 0;

    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String tmp = text.substring(start, end).trim();
        // Remove characters that are not needed.
        tmp = replace(tmp, "+", "");
        tmp = replace(tmp, "/", "");
        tmp = replace(tmp, "\\", "");
        tmp = replace(tmp, "#", "");
        tmp = replace(tmp, "*", "");
        tmp = replace(tmp, ")", "");
        tmp = replace(tmp, "(", "");
        tmp = replace(tmp, "&", "");
        if (tmp.length() > 0) {
            wordList.add(tmp);
        }
    }
    return wordList.toArray(new String[wordList.size()]);
}

From source file:org.cloudgraph.examples.test.model.NLPWikiParseTest.java

/**
 * Splits the buffered text into sentences using the US-locale sentence
 * {@link BreakIterator} and logs every sentence together with a timing value.
 * <p>
 * The per-sentence parse call is currently commented out, so the logged time
 * only covers two consecutive clock reads.
 *
 * @param buf the text to split
 * @throws IOException declared by the signature; nothing in this body throws it
 */
private void parse(StringBuilder buf) throws IOException {
    BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);

    String text = buf.toString();
    iterator.setText(text);

    int firstIndex = iterator.first();
    for (int lastIndex = iterator.next(); lastIndex != BreakIterator.DONE; firstIndex = lastIndex, lastIndex = iterator
            .next()) {
        String sentence = text.substring(firstIndex, lastIndex);
        long before = System.currentTimeMillis();
        //parse(sentence);
        long after = System.currentTimeMillis();
        log.info("time4: " + String.valueOf(after - before) + ": " + sentence);
    }

}