Example usage for edu.stanford.nlp.ling HasWord word

List of usage examples for edu.stanford.nlp.ling HasWord word

Introduction

On this page you can find example usage for edu.stanford.nlp.ling HasWord word.

Prototype

public String word();

Source Link

Document

Return the word value of the label (or null if none).

Usage

From source file:at.illecker.storm.commons.tokenizer.Tokenizer.java

License:Apache License

/**
 * Tokenizes the given text with the requested tokenizer.
 *
 * <p>The text is trimmed, Unicode escape sequences and HTML entities are
 * replaced, and the result is split into tokens by one of the supported
 * tokenizer backends.
 *
 * @param str  raw input text
 * @param type tokenizer backend to use
 * @return the tokens; an empty list for an unrecognized {@code type}
 *         (the original returned {@code null} here, which forced every
 *         caller to null-check before iterating)
 */
public static List<String> tokenize(String str, Type type) {
    // Step 1) Trim text
    str = str.trim();

    // Step 2) Replace Unicode symbols \u0000
    if (UnicodeUtils.containsUnicode(str)) {
        String replacedText = UnicodeUtils.replaceUnicodeSymbols(str);
        // containsUnicode matched but nothing changed: flag the unhandled symbol
        if ((LOGGING) && (replacedText.equals(str))) {
            LOG.warn("Unicode symbols could not be replaced: '" + str + "'");
        }
        str = replacedText;
    }

    // Step 3) Replace HTML symbols &#[0-9];
    if (HtmlUtils.containsHtml(str)) {
        String replacedText = HtmlUtils.replaceHtmlSymbols(str);
        // containsHtml matched but nothing changed: flag the unhandled entity
        if ((LOGGING) && (replacedText.equals(str))) {
            LOG.warn("HTML symbols could not be replaced: '" + str + "'");
        }
        str = replacedText;
    }

    // Step 4) Tokenize
    // Initialize to an empty list so an unrecognized type yields [] instead of null.
    List<String> tokenizedTokens = new ArrayList<String>();

    switch (type) {
    case REGEX_TOKENIZER:
        Matcher m = RegexUtils.TOKENIZER_PATTERN.matcher(str);
        while (m.find()) {
            tokenizedTokens.add(m.group());
        }
        break;

    case ARK_TOKENIZER:
        tokenizedTokens = Twokenize.tokenize(str);
        break;

    case STANFORD_TOKENIZER:
        TokenizerFactory<Word> tokenizer = PTBTokenizerFactory.newTokenizerFactory();
        tokenizer.setOptions("ptb3Escaping=false");
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(str), tokenizer);
        // Flatten the per-sentence token lists into a single List<String>
        for (List<HasWord> sentence : sentences) {
            for (HasWord word : sentence) {
                tokenizedTokens.add(word.word());
            }
        }
        break;

    default:
        break;
    }

    return tokenizedTokens;
}

From source file:com.summarizer.Utilities.java

License:Apache License

/**
 * Splits a document into sentences using Stanford's {@code DocumentPreprocessor}
 * and re-joins each sentence's tokens with single spaces.
 *
 * @param entireDoc the full document text
 * @return one space-joined string per detected sentence
 */
public static String[] sentenceTokonizer(String entireDoc) {
    Reader reader = new StringReader(entireDoc);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new LinkedList<String>();
    Iterator<List<HasWord>> it = dp.iterator();

    while (it.hasNext()) {
        StringBuilder sentenceSb = new StringBuilder();
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            // Bug fix: the original tested length() > 1, so when the first token
            // was a single character no separator was emitted before the second
            // token (e.g. "A" + "cat" produced "Acat"). Any non-empty buffer
            // means at least one token has been appended, so use > 0.
            if (sentenceSb.length() > 0) {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token.word());
        }
        sentenceList.add(sentenceSb.toString());
    }
    // toArray(T[]) already returns String[]; the explicit cast was redundant.
    return sentenceList.toArray(new String[sentenceList.size()]);
}

From source file:conditionalCFG.ConditionalCFGParser.java

License:Open Source License

/**
 * Seeds the diagonal of the CKY chart with lexical (tag) scores.
 *
 * <p>For each start position, either (a) when {@code maxSpanForTags > 1},
 * multi-token spans are concatenated into a single "word" and scored against
 * every tag state (appears to support character-level parsing), or (b) the
 * normal single-token cell [start, start+1] is tagged. The single-token path
 * honors a gold tag supplied via {@code HasTag}, falls back to flexible
 * tagging over all tag states when nothing scored, floods all tags with a
 * fiat score of -1000 in {@code floodTags} recovery mode, and finally applies
 * closed unary rules to the diagonal cell.
 *
 * @param sentence input tokens; elements may implement {@code HasWord},
 *                 {@code HasTag} and/or {@code HasContext}
 */
private void initializeChart(List sentence) {
    int boundary = wordIndex.indexOf(Lexicon.BOUNDARY);

    for (int start = 0; start < length; start++) {
        if (op.testOptions.maxSpanForTags > 1) { // only relevant for parsing single words as multiple input tokens.
            // todo [cdm 2012]: This case seems buggy in never doing unaries over span 1 items
            // note we don't look for "words" including the end symbol!
            for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags)
                    || (start + 1 == end); end++) {
                // Concatenate the tokens in [start, end) into one lexical item.
                StringBuilder word = new StringBuilder();
                //wsg: Feb 2010 - Appears to support character-level parsing
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = (HasWord) sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                // Score the concatenated span against every tag state.
                for (int state = 0; state < numStates; state++) {
                    float iS = iScore[start][end][state];
                    if (iS == Float.NEGATIVE_INFINITY && isTag[state]) {
                        IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state), wordIndex,
                                tagIndex);
                        iScore[start][end][state] = lex.score(itw, start, word.toString(), null);
                        if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) {
                            narrowRExtent[start][state] = start + 1;
                            narrowLExtent[end][state] = end - 1;
                            wideRExtent[start][state] = start + 1;
                            wideLExtent[end][state] = end - 1;
                        }
                    }
                }
            }

        } else { // "normal" chart initialization of the [start,start+1] cell

            int word = words[start];
            int end = start + 1;
            Arrays.fill(tags[start], false);

            // Aliases into the chart arrays for this cell, to avoid repeated indexing.
            float[] iScore_start_end = iScore[start][end];
            int[] narrowRExtent_start = narrowRExtent[start];
            int[] narrowLExtent_end = narrowLExtent[end];
            int[] wideRExtent_start = wideRExtent[start];
            int[] wideLExtent_end = wideLExtent[end];

            //Force tags
            String trueTagStr = null;
            if (sentence.get(start) instanceof HasTag) {
                trueTagStr = ((HasTag) sentence.get(start)).tag();
                if ("".equals(trueTagStr)) {
                    trueTagStr = null;
                }
            }

            // Another option for forcing tags: supply a regex
            String candidateTagRegex = null;
            /* if (sentence.get(start) instanceof CoreLabel) {
               candidateTagRegex = ((CoreLabel) sentence.get(start)).get(CandidatePartOfSpeechAnnotation.class);
               if ("".equals(candidateTagRegex)) {
                 candidateTagRegex = null;
               }
             }
            */
            //Word context (e.g., morphosyntactic info)
            String wordContextStr = null;
            if (sentence.get(start) instanceof HasContext) {
                wordContextStr = ((HasContext) sentence.get(start)).originalText();
                if ("".equals(wordContextStr))
                    wordContextStr = null;
            }

            boolean assignedSomeTag = false;

            if (!floodTags || word == boundary) {
                // in this case we generate the taggings in the lexicon,
                // which may itself be tagging flexibly or using a strict lexicon.
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]",
                            "UTF-8");
                }
                for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start,
                        wordContextStr); taggingI.hasNext();) {
                    IntTaggedWord tagging = taggingI.next();
                    int state = stateIndex.indexOf(tagIndex.get(tagging.tag));
                    // if word was supplied with a POS tag, skip all taggings
                    // not basicCategory() compatible with supplied tag.
                    if (trueTagStr != null) {
                        if ((!op.testOptions.forceTagBeginnings
                                && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr))
                                || (op.testOptions.forceTagBeginnings
                                        && !tagging.tagString(tagIndex).startsWith(trueTagStr))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging
                                        + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8");
                            }
                            continue;
                        }
                    }
                    if (candidateTagRegex != null) {
                        if ((!op.testOptions.forceTagBeginnings
                                && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex))
                                || (op.testOptions.forceTagBeginnings
                                        && !tagging.tagString(tagIndex).matches(candidateTagRegex))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging
                                        + " as it doesn't match candidateTagRegex: " + candidateTagRegex,
                                        "UTF-8");
                            }
                            continue;
                        }
                    }
                    // try {
                    float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr); // score the cell according to P(word|tag) in the lexicon
                    if (lexScore > Float.NEGATIVE_INFINITY) {
                        assignedSomeTag = true;
                        iScore_start_end[state] = lexScore;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                    // } catch (Exception e) {
                    // e.printStackTrace();
                    // System.out.println("State: " + state + " tags " + Numberer.getGlobalNumberer("tags").object(tagging.tag));
                    // }
                    int tag = tagging.tag;
                    tags[start][tag] = true;
                    if (dumpTagging) {
                        EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score "
                                + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state
                                + "]", "UTF-8");
                    }
                    //if (start == length-2 && tagging.parent == puncTag)
                    //  lastIsPunc = true;
                }
            } // end if ( ! floodTags || word == boundary)

            if (!assignedSomeTag) {
                // If you got here, either you were using forceTags (gold tags)
                // and the gold tag was not seen with that word in the training data
                // or we are in floodTags=true (recovery parse) mode
                // Here, we give words all tags for
                // which the lexicon score is not -Inf, not just seen or
                // specified taggings
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        if (trueTagStr != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).equals(trueTagStr)) {
                                continue;
                            }
                        }

                        float lexScore = lex.score(
                                new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start,
                                wordIndex.get(word), wordContextStr);
                        // NOTE(review): the candidateTagRegex filter runs after lex.score,
                        // so a filtered state still pays the scoring cost — confirm intended.
                        if (candidateTagRegex != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) {
                                continue;
                            }
                        }

                        if (lexScore > Float.NEGATIVE_INFINITY) {
                            iScore_start_end[state] = lexScore;
                            narrowRExtent_start[state] = end;
                            narrowLExtent_end[state] = start;
                            wideRExtent_start[state] = end;
                            wideLExtent_end[state] = start;
                        }
                        if (dumpTagging) {
                            EncodingPrintWriter.err.println("Word pos " + start + " tagging "
                                    + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))))
                                    + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state)
                                    + " = " + state + "]", "UTF-8");
                        }
                    }
                }
            } // end if ! assignedSomeTag

            // tag multi-counting
            if (op.dcTags) {
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state]) {
                        iScore_start_end[state] *= (1.0 + op.testOptions.depWeight);
                    }
                }
            }

            if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) {
                // if parse failed because of tag coverage, we put in all tags with
                // a score of -1000, by fiat.  You get here from the invocation of
                // parse(ls) inside parse(ls) *after* floodTags has been turned on.
                // Search above for "floodTags = true".
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        iScore_start_end[state] = -1000.0f;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                }
            }

            // Apply unary rules in diagonal cells of chart
            if (spillGuts) {
                tick("Terminal Unary...");
            }
            for (int state = 0; state < numStates; state++) {
                float iS = iScore_start_end[state];
                if (iS == Float.NEGATIVE_INFINITY) {
                    continue;
                }
                UnaryRule[] unaries = ug.closedRulesByChild(state);
                for (UnaryRule ur : unaries) {
                    int parentState = ur.parent;
                    float pS = ur.score + lex.score(ur, start, end);
                    float tot = iS + pS;
                    if (tot > iScore_start_end[parentState]) {
                        iScore_start_end[parentState] = tot;
                        narrowRExtent_start[parentState] = end;
                        narrowLExtent_end[parentState] = start;
                        wideRExtent_start[parentState] = end;
                        wideLExtent_end[parentState] = start;
                    }
                }
            }
            if (spillGuts) {
                tick("Next word...");
            }
        }
    } // end for start
}

From source file:conditionalCFG.ConditionalCFGParser.java

License:Open Source License

/**
 * Reconstructs the best parse tree for state {@code goal} over the span
 * [{@code start}, {@code end}) by re-deriving which rule produced the stored
 * inside score (no backpointers are kept, which speeds up the primary
 * chart-filling pass at the cost of this rescoring search).
 *
 * <p>Order of attempts: a lexical/tag derivation (multi-span or single-word,
 * lattice-aware), then binary rules over every split point, then unary rules.
 * Returns {@code null} (with a warning on stderr) if no derivation matches
 * the chart score.
 *
 * @param goal  grammar state to reconstruct
 * @param start span start (inclusive)
 * @param end   span end (exclusive)
 * @return the best parse subtree, or {@code null} if reconstruction fails
 */
private Tree extractBestParse(int goal, int start, int end) {
    // find source of inside score
    // no backtraces so we can speed up the parsing for its primary use
    double bestScore = iScore[start][end][goal];
    double normBestScore = op.testOptions.lengthNormalization ? (bestScore / wordsInSpan[start][end][goal])
            : bestScore;
    String goalStr = stateIndex.get(goal);

    // check tags
    if (end - start <= op.testOptions.maxSpanForTags && tagIndex.contains(goalStr)) {
        if (op.testOptions.maxSpanForTags > 1) {
            Tree wordNode = null;
            if (sentence != null) {
                // Rebuild the multi-token "word" exactly as initializeChart scored it.
                StringBuilder word = new StringBuilder();
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = (HasWord) sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                wordNode = tf.newLeaf(word.toString());

            } else if (lr != null) {
                // Lattice input: find the edge over this span whose score matches.
                List<LatticeEdge> latticeEdges = lr.getEdgesOverSpan(start, end);
                for (LatticeEdge edge : latticeEdges) {
                    IntTaggedWord itw = new IntTaggedWord(edge.word, stateIndex.get(goal), wordIndex, tagIndex);

                    float tagScore = (floodTags) ? -1000.0f : lex.score(itw, start, edge.word, null);
                    if (matches(bestScore, tagScore + (float) edge.weight)) {
                        wordNode = tf.newLeaf(edge.word);
                        if (wordNode.label() instanceof CoreLabel) {
                            CoreLabel cl = (CoreLabel) wordNode.label();
                            cl.setBeginPosition(start);
                            cl.setEndPosition(end);
                        }
                        break;
                    }
                }
                if (wordNode == null) {
                    throw new RuntimeException(
                            "could not find matching word from lattice in parse reconstruction");
                }

            } else {
                throw new RuntimeException("attempt to get word when sentence and lattice are null!");
            }
            Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
            tagNode.setScore(bestScore);
            if (originalTags[start] != null) {
                tagNode.label().setValue(originalTags[start].tag());
            }
            return tagNode;
        } else { // normal lexicon is single words case
            IntTaggedWord tagging = new IntTaggedWord(words[start], tagIndex.indexOf(goalStr));
            String contextStr = getCoreLabel(start).originalText();
            float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr);
            if (tagScore > Float.NEGATIVE_INFINITY || floodTags) {
                // return a pre-terminal tree
                CoreLabel terminalLabel = getCoreLabel(start);

                Tree wordNode = tf.newLeaf(terminalLabel);
                Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
                tagNode.setScore(bestScore);
                if (terminalLabel.tag() != null) {
                    tagNode.label().setValue(terminalLabel.tag());
                }
                if (tagNode.label() instanceof HasTag) {
                    ((HasTag) tagNode.label()).setTag(tagNode.label().value());
                }
                return tagNode;
            }
        }
    }
    // check binaries first
    for (int split = start + 1; split < end; split++) {
        for (Iterator<BinaryRule> binaryI = bg.ruleIteratorByParent(goal); binaryI.hasNext();) {
            BinaryRule br = binaryI.next();
            double score = br.score + iScore[start][split][br.leftChild] + iScore[split][end][br.rightChild]
                    + lex.score(br, start, end, split);
            boolean matches;
            if (op.testOptions.lengthNormalization) {
                double normScore = score
                        / (wordsInSpan[start][split][br.leftChild] + wordsInSpan[split][end][br.rightChild]);
                matches = matches(normScore, normBestScore);
            } else {
                matches = matches(score, bestScore);
            }
            if (matches) {
                // build binary split
                Tree leftChildTree = extractBestParse(br.leftChild, start, split);
                Tree rightChildTree = extractBestParse(br.rightChild, split, end);
                List<Tree> children = new ArrayList<Tree>();
                children.add(leftChildTree);
                children.add(rightChildTree);
                Tree result = tf.newTreeNode(goalStr, children);
                result.setScore(score);
                // System.err.println("    Found Binary node: "+result);
                return result;
            }
        }
    }
    // check unaries
    // note that even though we parse with the unary-closed grammar, we can
    // extract the best parse with the non-unary-closed grammar, since all
    // the intermediate states in the chain must have been built, and hence
    // we can exploit the sparser space and reconstruct the full tree as we go.
    // for (Iterator<UnaryRule> unaryI = ug.closedRuleIteratorByParent(goal); unaryI.hasNext(); ) {
    for (Iterator<UnaryRule> unaryI = ug.ruleIteratorByParent(goal); unaryI.hasNext();) {
        UnaryRule ur = unaryI.next();
        // System.err.println("  Trying " + ur + " dtr score: " + iScore[start][end][ur.child]);
        double score = ur.score + iScore[start][end][ur.child] + lex.score(ur, start, end);
        boolean matches;
        if (op.testOptions.lengthNormalization) {
            double normScore = score / wordsInSpan[start][end][ur.child];
            matches = matches(normScore, normBestScore);
        } else {
            matches = matches(score, bestScore);
        }
        if (ur.child != ur.parent && matches) {
            // build unary
            Tree childTree = extractBestParse(ur.child, start, end);
            Tree result = tf.newTreeNode(goalStr, Collections.singletonList(childTree));
            // System.err.println("    Matched!  Unary node: "+result);
            result.setScore(score);
            return result;
        }
    }
    System.err.println("Warning: no parse found in ExhaustivePCFGParser.extractBestParse: failing on: [" + start
            + ", " + end + "] looking for " + goalStr);
    return null;
}

From source file:de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp.java

License:Open Source License

/**
 * Applies PTB escaping to a whole token sequence and then normalizes any
 * configured quote tokens to the PTB opening/closing quote forms.
 *
 * @param words      the tokens to escape (escaped in one pass so the
 *                   context-sensitive escaper can pair up quotes)
 * @param quoteBegin surface forms to rewrite as opening quotes, or null
 * @param quoteEnd   surface forms to rewrite as closing quotes, or null
 * @return the escaped token list
 */
@SuppressWarnings("unchecked")
public static <T extends HasWord> List<T> applyPtbEscaping(List<T> words, Collection<String> quoteBegin,
        Collection<String> quoteEnd) {
    // Apply escaper to the whole sentence, not to each token individually. The
    // escaper takes context into account, e.g. when transforming regular double
    // quotes into PTB opening and closing quotes (`` and '').
    PTBEscapingProcessor<T, String, Word> escaper = new PTBEscapingProcessor<T, String, Word>();
    List<T> escaped = (List<T>) escaper.apply(words);

    // Force listed quote tokens to their canonical PTB forms.
    for (HasWord token : escaped) {
        String surface = token.word();
        if (quoteBegin != null && quoteBegin.contains(surface)) {
            token.setWord("``");
        } else if (quoteEnd != null && quoteEnd.contains(surface)) {
            token.setWord("\'\'");
        }
    }

    return escaped;
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils.java

License:Open Source License

/**
 * Applies PTB escaping to a whole token sequence and then normalizes any
 * configured quote tokens to the PTB opening/closing quote forms.
 *
 * @param words      the tokens to escape (escaped in one pass so the
 *                   context-sensitive escaper can pair up quotes)
 * @param quoteBegin surface forms to rewrite as opening quotes, or null
 * @param quoteEnd   surface forms to rewrite as closing quotes, or null
 * @return the escaped token list
 */
public static List<HasWord> applyPtbEscaping(List<HasWord> words, Collection<String> quoteBegin,
        Collection<String> quoteEnd) {
    // Apply escaper to the whole sentence, not to each token individually. The
    // escaper takes context into account, e.g. when transforming regular double
    // quotes into PTB opening and closing quotes (`` and '').
    PTBEscapingProcessor<HasWord, String, Word> escaper = new PTBEscapingProcessor<HasWord, String, Word>();
    List<HasWord> escaped = escaper.apply(words);

    // Force listed quote tokens to their canonical PTB forms.
    for (HasWord token : escaped) {
        String surface = token.word();
        if (quoteBegin != null && quoteBegin.contains(surface)) {
            token.setWord("``");
        } else if (quoteEnd != null && quoteEnd.contains(surface)) {
            token.setWord("\'\'");
        }
    }

    return escaped;
}

From source file:edu.cmu.ark.AnalysisUtilities.java

License:Open Source License

/**
 * Splits a document into sentences: the text is preprocessed, broken into
 * paragraphs on newlines, each paragraph is sentence-split by Stanford's
 * {@code DocumentPreprocessor}, and each sentence's tokens are re-joined
 * with single spaces.
 *
 * @param document the raw document text
 * @return one space-joined string per detected sentence
 */
public static List<String> getSentences(String document) {
    DocumentPreprocessor dp = new DocumentPreprocessor(false);
    List<String> res = new ArrayList<String>();

    document = preprocess(document);

    String[] paragraphs = document.split("\\n");

    for (int i = 0; i < paragraphs.length; i++) {
        StringReader reader = new StringReader(paragraphs[i]);
        List<List<? extends HasWord>> sents = new ArrayList<List<? extends HasWord>>();

        try {
            sents = dp.getSentencesFromText(reader);
        } catch (Exception e) {
            // Best effort: a paragraph that fails to sentence-split is skipped.
            e.printStackTrace();
        }

        for (List<? extends HasWord> tokens : sents) {
            // StringBuilder replaces the original String += in a loop (O(n^2)).
            // Joining with a leading check also removes the trailing space the
            // original had to trim() off, producing the identical result.
            StringBuilder sb = new StringBuilder();
            for (HasWord token : tokens) {
                if (sb.length() > 0) {
                    sb.append(' ');
                }
                // token.word() is already a String; the original's .toString() was redundant.
                sb.append(token.word());
            }
            res.add(sb.toString());
        }
    }

    return res;
}

From source file:edu.nyu.nyuvis.cfutils.nlp.utils.CoreNLP.java

/**
 * Copies the surface form of a Stanford {@code HasWord} into a new local
 * {@code Word} instance.
 *
 * @param hw the token to convert
 * @return a fresh {@code Word} carrying {@code hw.word()}
 */
public static Word convert(HasWord hw) {
    Word converted = new Word();
    converted.word(hw.word());
    return converted;
}

From source file:nlp.morph.noun.MainNounDetetectionLayer.java

License:Open Source License

/**
 * Tokenizes the input file with MaxentTagger, runs Tamil morphological noun
 * detection over each sentence, and writes the tagged stems to output.txt.
 *
 * @param args args[0] is the path of the input text file
 * @throws Exception if initialization or tokenization fails
 */
public static void main(String[] args) throws Exception {

    InitSystem.init();
    if (args.length == 0) {
        System.out.println("Usage : java <Input File>");
        System.exit(0);
    }

    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new BufferedReader(new FileReader(args[0])));

    // try-with-resources guarantees the writer is flushed and closed even if
    // processing throws; the original leaked it on any mid-stream failure.
    try (PrintWriter writer = new PrintWriter("output.txt", "UTF-8")) {

        sentences.stream().map((input) -> {
            String[] words = new String[input.size()];
            int i = 0;
            for (HasWord l : input) {
                words[i] = l.word();
                i++;
            }

            return words;
        }).map((String[] words) -> {
            String wordBefore = "";
            for (int index = 0; index < words.length; index++) {

                String temp = words[index].replaceAll("[.,'?/\"%]", "");
                String outputSentence = "";
                String stemSentence = "";
                if (isNumeric(temp)) {
                    outputSentence += words[index] + "\t<NUM> ";
                    stemSentence += words[index] + "\t<NUM> ";
                } else if (NounListCheck.isExist(words[index])) {

                    outputSentence += words[index] + "\t<NOUN> ";

                    stemSentence += words[index] + "\t<NOUN> ";
                } else {
                    List<TamilFontEntity> tamilWord = IOLayer.getTamil(temp);

                    List<TamilFontEntity> cleanWord = tamilWord;

                    if (!tamilWord.isEmpty()) {
                        // Drop a trailing joining glyph when the next word starts
                        // at the same x location — NOTE(review): inferred from the
                        // location checks below; confirm against IOLayer semantics.
                        if (index != words.length - 1 && !words[index].contains(".")
                                && tamilWord.get(tamilWord.size() - 1).getxLocation() != 9
                                && tamilWord.get(tamilWord.size() - 1).getyLocation() == -1
                                && IOLayer.getTamil(words[index + 1]).get(0).getxLocation() == tamilWord
                                        .get(tamilWord.size() - 1).getxLocation()) {
                            cleanWord = tamilWord.subList(0, tamilWord.size() - 1);

                        }
                    }

                    String tempWord = IOLayer.getText(cleanWord).toString();

                    if (tempWord.length() > 0) {
                        if (index == words.length - 2) {
                            if (cleanWord.size() > 2) {
                                outputSentence += words[index] + "\t<VERB>";
                                stemSentence += words[index] + "\t<VERB>";
                            } else {
                                outputSentence += words[index] + "\t<NOTDEFINED>";
                                stemSentence += words[index] + "\t<NOTDEFINED>";
                            }
                        } else {
                            List<TamilFontEntity> stem = detectStems(tempWord, wordBefore);
                            if (NumberDetectorLayer.isNumber(cleanWord)) {
                                outputSentence += words[index] + "\t<NUM> ";
                                stemSentence += words[index] + "\t<NUM> ";
                            } else if (PronounAndArticleDetector.isArticle(tempWord)) {
                                outputSentence += words[index] + "\t<ATL> ";
                                stemSentence += words[index] + "\t<ATL> ";
                            } else if (PronounAndArticleDetector.isProNoun(tempWord)) {
                                outputSentence += words[index] + "\t<PNN> ";
                                stemSentence += words[index] + "\t<PNN> ";
                            } else if (stem != null) {
                                outputSentence += words[index] + "\t<NOUN> ";
                                stemSentence += IOLayer.getText(stem) + "\t<NOUN> ";
                            } else {
                                stem = getNounStem(tempWord);

                                if (stem != null) {

                                    outputSentence += words[index] + "\t<NOUN> ";
                                    stemSentence += IOLayer.getText(stem) + "\t<NOUN> ";

                                } else if (ExtractNounStemMLayer.extractStemNounM(tempWord) != null) {
                                    List<TamilFontEntity> stemWithoutM = ExtractNounStemMLayer
                                            .extractStemNounM(tempWord);

                                    stemWithoutM = extractPluralStem(stemWithoutM);

                                    if (stemWithoutM.equals(IOLayer.getTamil(tempWord))) {
                                        outputSentence += IOLayer.getText(stemWithoutM) + "\t<JOIN> ";
                                        stemSentence += IOLayer.getText(stemWithoutM) + "\t<JOIN> ";
                                    } else {
                                        outputSentence += IOLayer.getText(stemWithoutM) + "\t<NOUN> ";
                                        stemSentence += IOLayer.getText(stemWithoutM) + "\t<NOUN> ";
                                    }
                                } else {
                                    outputSentence += IOLayer.getText(cleanWord) + "\t<NOTDEFINED> ";
                                    stemSentence += IOLayer.getText(cleanWord) + "\t<NOTDEFINED> ";
                                }
                            }
                        }
                    }
                }

                if (temp.trim().equals("")) {
                    writer.println(words[index] + "\t<SYM>");

                } else {
                    writer.println(stemSentence);
                    wordBefore = words[index];
                }
            }
            return wordBefore;
        }).map((wordBefore) -> IOLayer.getTamil(wordBefore)).filter((tamilWord) -> (tamilWord.size() > 2))
                .forEach((_item) -> {
                    writer.println("");
                });
    } catch (FileNotFoundException | UnsupportedEncodingException e) {
        // The original swallowed this silently, producing no output and no
        // diagnostic; at minimum report why output.txt could not be written.
        System.err.println("Could not write output.txt: " + e);
    }
}

From source file:nlp.morph.noun.MainNounDetetectionLayer.java

License:Open Source License

/**
 * Tokenizes {@code data} into sentences (via Stanford's MaxentTagger tokenizer),
 * classifies each token with a coarse tag (NUM, NOUN, VERB, ATL, PNN, JOIN, SYM,
 * NOTDEFINED — inferred from the branch labels below) and echoes the stemmed
 * tokens to {@code resources/morphed_stem.txt} as a side effect.
 *
 * @param data raw input text to analyze
 * @return one inner list per detected sentence, each element a two-slot
 *         {@code String[]} of {word, tag}; {@code null} if the output file
 *         could not be opened
 */
public static List<List<String[]>> getMorph(String data) {

    // try-with-resources guarantees the writer is closed and removes the NPE the
    // old finally block threw when the PrintWriter constructor itself failed
    // (pw was still null when pw.close() ran).
    try (PrintWriter pw = new PrintWriter(new File("resources/morphed_stem.txt"))) {
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(data));
        List<List<String[]>> morphOutput = new ArrayList<>();
        ArrayList<String[]> list = new ArrayList<>();
        boolean checkSentence = true;

        for (List<HasWord> input : sentences) {
            // Flatten the tokenized sentence into a plain word array.
            String[] words = new String[input.size()];

            int i = 0;
            for (HasWord l : input) {
                words[i] = l.word();

                i++;
            }

            String wordBefore = "";

            // A previous iteration decided the sentence ended: flush the
            // accumulated {word, tag} pairs and start a new sentence list.
            if (checkSentence) {
                if (list.size() > 0) {
                    morphOutput.add(list);
                }
                list = new ArrayList<>();
                checkSentence = false;
                pw.println();
            }

            for (int index = 0; index < words.length; index++) {

                // temp is the token with common punctuation stripped; an empty
                // temp means the token was pure punctuation (tagged SYM below).
                String temp = words[index].replaceAll("[.,/?\"'`#$%]", "");
                String outputSentence = "";
                String stemSentence = "";
                if (isNumeric(temp)) {
                    outputSentence = "NUM";
                    stemSentence = temp;

                } else if (NounListCheck.isExist(words[index])) {

                    outputSentence = "NOUN";
                    stemSentence = words[index];

                } else {
                    List<TamilFontEntity> tamilWord = IOLayer.getTamil(temp);

                    List<TamilFontEntity> cleanWord = tamilWord;

                    if (!tamilWord.isEmpty()) {
                        // When the trailing glyph of this word matches the leading
                        // glyph of the next one (by x/y location), drop it — it
                        // presumably belongs to the next token. TODO confirm the
                        // exact glyph-merge semantics against IOLayer.
                        if (index != words.length - 1 && !words[index].contains(".")
                                && tamilWord.get(tamilWord.size() - 1).getxLocation() != 9
                                && tamilWord.get(tamilWord.size() - 1).getyLocation() == -1
                                && IOLayer.getTamil(words[index + 1]).get(0).getxLocation() == tamilWord
                                        .get(tamilWord.size() - 1).getxLocation()) {
                            cleanWord = tamilWord.subList(0, tamilWord.size() - 1);

                        }
                    }

                    String tempWord = IOLayer.getText(cleanWord).toString();

                    if (tempWord.length() > 0) {
                        // Second-to-last token: heuristic verb/unknown split on
                        // glyph count only.
                        if (index == words.length - 2) {
                            if (cleanWord.size() > 2) {

                                outputSentence = "VERB";
                                stemSentence = words[index];

                            } else {
                                outputSentence = "NOTDEFINED";
                                stemSentence = IOLayer.getText(cleanWord).toString();
                            }

                        } else {
                            List<TamilFontEntity> stem = detectStems(tempWord, wordBefore);

                            if (NumberDetectorLayer.isNumber(cleanWord)) {
                                outputSentence = "NUM";
                                stemSentence = IOLayer.getText(cleanWord).toString();

                            } else if (PronounAndArticleDetector.isArticle(tempWord)) {
                                outputSentence = "ATL";
                                stemSentence = tempWord;

                            } else if (PronounAndArticleDetector.isProNoun(tempWord)) {
                                outputSentence = "PNN";
                                stemSentence = tempWord;

                            } else if (stem != null) {
                                outputSentence = "NOUN";
                                stemSentence = tempWord;

                            } else {
                                stem = getNounStem(tempWord);

                                if (stem != null) {

                                    outputSentence = "NOUN";
                                    stemSentence = IOLayer.getText(stem).toString();

                                } else if (ExtractNounStemMLayer.extractStemNounM(tempWord) != null) {
                                    List<TamilFontEntity> stemWithoutM = ExtractNounStemMLayer
                                            .extractStemNounM(tempWord);

                                    stemWithoutM = extractPluralStem(stemWithoutM);
                                    stemSentence = IOLayer.getText(stemWithoutM).toString();
                                    // Stem identical to the full word => the "M"
                                    // layer only re-joined it, so tag as JOIN.
                                    if (stemWithoutM.equals(IOLayer.getTamil(tempWord))) {
                                        outputSentence = "JOIN";

                                    } else {
                                        outputSentence = "NOUN";
                                        stemSentence = words[index];
                                    }
                                } else {
                                    outputSentence = "NOTDEFINED";

                                }
                            }
                        }
                    }
                }

                String[] word = new String[2];

                // NOTE(review): startsWith("") is always true, so this branch
                // swallows every token and the else-ifs below are dead code; the
                // duplicated startsWith("?") clauses look like Tamil characters
                // lost in text extraction (mojibake). Recover the original
                // characters from the source repository before relying on this.
                if (words[index].startsWith("?") || words[index].startsWith("?")
                        || words[index].startsWith("")) {
                    // Append this token onto the previous entry (clitic/suffix
                    // merge). Throws if the list is empty — presumably the
                    // original guard characters prevented that; verify.
                    String[] w = list.remove(list.size() - 1);
                    w[0] = w[0] + words[index];
                    list.add(w);
                } else if (temp.trim().equals("")) {
                    // Pure punctuation token.
                    word[0] = words[index];
                    word[1] = "SYM";

                    pw.println(word[0]);
                    list.add(word);

                } else {
                    word[0] = words[index];
                    // A dotted non-numeric token (e.g. an abbreviation) is split
                    // on '.' and each fragment emitted separately; each '.' may
                    // also terminate the current sentence.
                    if (word[0].contains(".") && !word[0].matches(".*\\d.*")) {
                        String[] wo = word[0].split("\\.");
                        int j = 0;
                        for (int k = 0; k < words[index].length();) {
                            if (String.valueOf(words[index].charAt(k)).contains(".")) {
                                word = new String[2];
                                word[0] = ".";
                                word[1] = "SYM";
                                pw.println(word[0]);

                                list.add(word);
                                List<TamilFontEntity> tamilWord = IOLayer.getTamil(wordBefore);
                                // NOTE(review): same mojibake suspicion as above —
                                // four identical contains("?") checks.
                                if (tamilWord.size() > 2 && !wordBefore.contains("?")
                                        && !wordBefore.contains("?")
                                        && !wordBefore.contains("?")
                                        && !wordBefore.contains("?") && !isNumeric(wordBefore)) {
                                    if (list.size() > 0) {
                                        morphOutput.add(list);
                                    }
                                    pw.println();
                                    list = new ArrayList<>();

                                }

                                list.add(word);
                                k++;
                            } else {
                                word = new String[2];
                                word[0] = wo[j];
                                pw.println(wo[j]);
                                if (isNumeric(word[0])) {
                                    word[1] = "NUM";
                                } else {
                                    word[1] = "NOTDEFINED";
                                }

                                list.add(word);

                                k = k + wo[j].length();
                                wordBefore = wo[j];
                                j++;
                            }
                        }

                    } else {
                        pw.println(stemSentence);
                        word[0] = words[index];
                        pw.println(word[0]);
                        word[1] = outputSentence;
                        list.add(word);
                        wordBefore = word[0];
                    }

                }

            }
            // End-of-sentence heuristic on the last real word of the sentence.
            List<TamilFontEntity> tamilWord = IOLayer.getTamil(wordBefore);

            if (tamilWord.size() > 2 && !wordBefore.contains("?")
                    && !wordBefore.contains("?") && !wordBefore.contains("?")
                    && !isNumeric(wordBefore)) {
                checkSentence = true;

                pw.println();
            }

        }
        // Flush the final (possibly unterminated) sentence.
        if (list.size() > 0) {
            morphOutput.add(list);
        }
        return morphOutput;
    } catch (FileNotFoundException ex) {
        Logger.getLogger(MainNounDetetectionLayer.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;

}