Java String Normalize normaliseConll(String input)

Here you can find the source of normaliseConll(String input)

Description

Removes quotation-mark and bracket tokens, plus sentence-final punctuation, from CoNLL input, because the PLTAG parser discards them.

License

Open Source License

Parameter

Parameter Description
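input the CoNLL sentence to normalise: one token per line, with tab-separated columns and the word form in the second column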

Declaration

public static String normaliseConll(String input) 

Method Source Code

//package com.java2s;
/* // from www.java2s.com
 * Copyright (C) 2015 ikonstas
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

public class Main {
    /** Matches a word that consists of exactly one ASCII punctuation character. */
    private static final Pattern SINGLE_PUNCT = Pattern.compile("\\p{Punct}");

    /**
     * Normalises a CoNLL sentence by dropping tokens that the PLTAG parser
     * discards.
     * <p>
     * Two passes are made over the tokens (one token per line, tab-separated
     * columns, word form in the second column):
     * <ol>
     * <li>every token whose word is one of {@code ``}, {@code `}, {@code ''},
     * <code>{</code>, <code>}</code>, {@code (} or {@code )} is removed;</li>
     * <li>trailing tokens whose word is a single punctuation character are
     * removed, except for {@code %}, {@code :} and {@code ,}, which stop the
     * trimming.</li>
     * </ol>
     * Lines that have no second column are left untouched and also stop the
     * trailing-punctuation trimming.
     *
     * @param input the CoNLL sentence, tokens separated by {@code "\n"}
     * @return the remaining tokens joined by {@code "\n"}, or the empty string
     *         if no token is left
     * @throws NullPointerException if {@code input} is {@code null}
     */
    public static String normaliseConll(String input) {
        List<String> tokens = unpackConllSentence(input);
        for (Iterator<String> iter = tokens.iterator(); iter.hasNext();) {
            String word = wordOf(iter.next());
            if (word != null && isQuoteOrBracket(word)) {
                iter.remove();
            }
        }

        // Guard on isEmpty(): a sentence made only of discardable tokens must
        // yield an empty result instead of indexing past the start of the list.
        while (!tokens.isEmpty() && isDroppableFinalWord(wordOf(tokens.get(tokens.size() - 1)))) {
            tokens.remove(tokens.size() - 1);
        }
        return repackConllSentence(tokens);
    }

    /**
     * Extracts the word form (second tab-separated column) of a CoNLL line.
     *
     * @param token a single CoNLL line
     * @return the second column, or {@code null} if the line has fewer than
     *         two columns
     */
    private static String wordOf(String token) {
        String[] columns = token.split("\t");
        return columns.length > 1 ? columns[1] : null;
    }

    /** Tells whether the word is one of the quote/bracket tokens that are always removed. */
    private static boolean isQuoteOrBracket(String word) {
        return word.equals("``") || word.equals("`") || word.equals("''") || word.equals("{") || word.equals("}")
                || word.equals("(") || word.equals(")");
    }

    /**
     * Tells whether a sentence-final word must be trimmed: a single
     * punctuation character other than {@code %}, {@code :} and {@code ,}.
     */
    private static boolean isDroppableFinalWord(String word) {
        return word != null && SINGLE_PUNCT.matcher(word).matches() && !word.equals("%") && !word.equals(":")
                && !word.equals(",");
    }

    /**
     * Splits a CoNLL sentence into its lines.
     *
     * @param input the sentence, lines separated by {@code "\n"}
     * @return a new, modifiable list with one entry per line
     */
    public static List<String> unpackConllSentence(String input) {
        return unpack(input, "\n");
    }

    /**
     * Joins CoNLL lines back into a single sentence.
     *
     * @param input the lines to join
     * @return the lines separated by {@code "\n"}
     */
    public static String repackConllSentence(List<String> input) {
        return repack(input, "\n");
    }

    /**
     * Splits a string into a new, modifiable list.
     * <p>
     * The delimiter is handed to {@link String#split(String)} and is therefore
     * interpreted as a regular expression; trailing empty strings are not
     * included in the result.
     *
     * @param input     the string to split
     * @param delimiter the separator, as a regular expression
     * @return a new {@link ArrayList} holding the pieces in order
     */
    public static List<String> unpack(String input, String delimiter) {
        return new ArrayList<String>(Arrays.asList(input.split(delimiter)));
    }

    /**
     * Joins the elements of a list, placing the delimiter between consecutive
     * elements.
     *
     * @param input     the elements to join
     * @param delimiter the literal text inserted between elements
     * @return the joined string, or the empty string for an empty list
     */
    public static String repack(List<String> input, String delimiter) {
        StringBuilder str = new StringBuilder();
        Iterator<String> iter = input.iterator();
        if (iter.hasNext()) {
            str.append(iter.next());
        }
        while (iter.hasNext()) {
            str.append(delimiter).append(iter.next());
        }
        return str.toString();
    }

    /**
     * Copies an {@code int} array into a list of boxed integers, preserving
     * order.
     *
     * @param ar the array to copy
     * @return a new list containing the array's values
     */
    public static List<Integer> asList(int[] ar) {
        List<Integer> list = new ArrayList<Integer>(ar.length);
        for (int a : ar) {
            list.add(a);
        }
        return list;
    }
}

Related

  1. normaliseUnicode(String unicodeText, char[] mappings)
  2. normalize(final String input)
  3. normalize(final String s)
  4. normalize(final String s)