eu.interedition.collatex.simple.SimplePatternTokenizer.java Source code

Java tutorial

Introduction

Here is the source code for the class eu.interedition.collatex.simple.SimplePatternTokenizer (file SimplePatternTokenizer.java).

Source

/*
 * Copyright (c) 2013 The Interedition Development Group.
 *
 * This file is part of CollateX.
 *
 * CollateX is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CollateX is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CollateX.  If not, see <http://www.gnu.org/licenses/>.
 */

package eu.interedition.collatex.simple;

import com.google.common.base.Function;
import com.google.common.collect.Lists;

import javax.annotation.Nullable;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Splits a string into tokens by repeatedly applying a regular expression:
 * every successive match of the configured {@link Pattern} becomes one token.
 * Parts of the input that the pattern does not match are dropped.
 *
 * @author <a href="http://gregor.middell.net/" title="Homepage">Gregor Middell</a>
 * @author Ronald Haentjens Dekker
 */
public class SimplePatternTokenizer implements Function<String, Iterable<String>> {

    private final Pattern pattern;

    /**
     * Creates a tokenizer that emits one token per match of the given pattern.
     *
     * @param pattern the expression whose successive matches are the tokens
     */
    public SimplePatternTokenizer(Pattern pattern) {
        this.pattern = pattern;
    }

    /**
     * Tokenizes the given text.
     *
     * @param input the text to tokenize; may be {@code null}
     * @return the matched substrings in order of occurrence; empty if the input
     *         is {@code null} or the pattern never matches
     */
    @Override
    public Iterable<String> apply(@Nullable String input) {
        final List<String> tokens = Lists.newArrayList();
        if (input == null) {
            // The parameter is declared @Nullable, so a missing text yields no tokens
            // instead of a NullPointerException from Pattern#matcher.
            return tokens;
        }
        final Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            tokens.add(matcher.group());
        }
        return tokens;
    }

    /**
     * Tokens are runs of non-whitespace characters; each token keeps the whitespace
     * that follows it (and the first token any whitespace that precedes it).
     */
    public static final SimplePatternTokenizer BY_WHITESPACE = new SimplePatternTokenizer(
            // No trailing ']' here: outside a character class it is a literal and would
            // force every token to end with a closing bracket.
            Pattern.compile("\\s*?\\S+\\s*"));

    /** The punctuation characters {@code . ? ! , ; :} quoted for use inside a regular expression. */
    static final String PUNCT = Pattern.quote(".?!,;:");

    /**
     * Whitespace and punctuation both act as separators: a token is a run of characters
     * that are neither, together with the separators that follow it (and, for the first
     * token, any separators that precede it).
     */
    public static final SimplePatternTokenizer BY_WS_AND_PUNCT = new SimplePatternTokenizer(
            Pattern.compile("[\\s" + PUNCT + "]*?[^\\s" + PUNCT + "]+[\\s" + PUNCT + "]*"));

    /**
     * Punctuation runs form tokens of their own: a token is either a run of punctuation
     * or a run of characters that are neither punctuation nor whitespace, in both cases
     * together with the whitespace that follows it.
     */
    public static final SimplePatternTokenizer BY_WS_OR_PUNCT = new SimplePatternTokenizer(
            Pattern.compile("[" + PUNCT + "]+[\\s]*|[^" + PUNCT + "\\s]+[\\s]*"));
}