List of usage examples for the `edu.stanford.nlp.ling.HasWord#word()` method
public String word();
From source file:at.illecker.storm.commons.tokenizer.Tokenizer.java
License:Apache License
public static List<String> tokenize(String str, Type type) { // Step 1) Trim text str = str.trim();/* w w w .j av a2s . c om*/ // Step 2) Replace Unicode symbols \u0000 if (UnicodeUtils.containsUnicode(str)) { String replacedText = UnicodeUtils.replaceUnicodeSymbols(str); // LOG.info("Replaced Unicode symbols from '" + str + "' to '" // + replacedText + "'"); if ((LOGGING) && (replacedText.equals(str))) { LOG.warn("Unicode symbols could not be replaced: '" + str + "'"); } str = replacedText; } // Step 3) Replace HTML symbols &#[0-9]; if (HtmlUtils.containsHtml(str)) { String replacedText = HtmlUtils.replaceHtmlSymbols(str); // LOG.info("Replaced HTML symbols from '" + text + "' to '" // + replacedText + "'"); if ((LOGGING) && (replacedText.equals(str))) { LOG.warn("HTML symbols could not be replaced: '" + str + "'"); } str = replacedText; } // Step 4) Tokenize List<String> tokenizedTokens = null; switch (type) { case REGEX_TOKENIZER: tokenizedTokens = new ArrayList<String>(); Matcher m = RegexUtils.TOKENIZER_PATTERN.matcher(str); while (m.find()) { tokenizedTokens.add(m.group()); } break; case ARK_TOKENIZER: tokenizedTokens = Twokenize.tokenize(str); break; case STANFORD_TOKENIZER: TokenizerFactory<Word> tokenizer = PTBTokenizerFactory.newTokenizerFactory(); tokenizer.setOptions("ptb3Escaping=false"); List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(str), tokenizer); // Convert sentences to List<String> tokenizedTokens = new ArrayList<String>(); for (List<HasWord> sentence : sentences) { for (HasWord word : sentence) { tokenizedTokens.add(word.word()); } } break; default: break; } return tokenizedTokens; }
From source file:com.summarizer.Utilities.java
License:Apache License
/**
 * Splits a document into sentences using Stanford's DocumentPreprocessor and
 * re-joins each sentence's tokens with single spaces.
 *
 * @param entireDoc the full document text
 * @return one space-joined string per detected sentence
 */
public static String[] sentenceTokonizer(String entireDoc) {
    Reader reader = new StringReader(entireDoc);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new LinkedList<String>();
    for (List<HasWord> sentence : dp) {
        StringBuilder sentenceSb = new StringBuilder();
        for (HasWord token : sentence) {
            // FIX: was "length() > 1", which skipped the separator after a
            // one-character first token and fused the first two words.
            if (sentenceSb.length() > 0) {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token.word());
        }
        sentenceList.add(sentenceSb.toString());
    }
    // toArray already returns String[]; the old explicit cast was redundant.
    return sentenceList.toArray(new String[sentenceList.size()]);
}
From source file:conditionalCFG.ConditionalCFGParser.java
License:Open Source License
// Initializes the lexical (diagonal) cells of the CKY chart for one sentence:
// scores each word (or multi-token span when maxSpanForTags > 1) against every
// tag state via the lexicon, honoring gold tags (HasTag), word context
// (HasContext), flexi-tagging fallback, tag flooding for recovery parses, and
// finally applies closed unary rules over the diagonal.
// NOTE(review): this listing was scraped with the original line structure lost —
// several `//` comments now swallow code that followed them on the same source
// line, so the text below is NOT compilable as shown. It is kept byte-for-byte;
// reformatting would require the original file to do safely.
// NOTE(review): relies on many enclosing-class fields (iScore, narrowRExtent,
// wideRExtent, words, tags, lex, op, stateIndex, tagIndex, wordIndex, tlp,
// floodTags, dumpTagging, ug, spillGuts) not visible in this chunk.
private void initializeChart(List sentence) { int boundary = wordIndex.indexOf(Lexicon.BOUNDARY); for (int start = 0; start < length; start++) { if (op.testOptions.maxSpanForTags > 1) { // only relevant for parsing single words as multiple input tokens. // todo [cdm 2012]: This case seems buggy in never doing unaries over span 1 items // note we don't look for "words" including the end symbol! for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags) || (start + 1 == end); end++) { StringBuilder word = new StringBuilder(); //wsg: Feb 2010 - Appears to support character-level parsing for (int i = start; i < end; i++) { if (sentence.get(i) instanceof HasWord) { HasWord cl = (HasWord) sentence.get(i); word.append(cl.word()); } else { word.append(sentence.get(i).toString()); } } for (int state = 0; state < numStates; state++) { float iS = iScore[start][end][state]; if (iS == Float.NEGATIVE_INFINITY && isTag[state]) { IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state), wordIndex, tagIndex); iScore[start][end][state] = lex.score(itw, start, word.toString(), null); if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) { narrowRExtent[start][state] = start + 1; narrowLExtent[end][state] = end - 1; wideRExtent[start][state] = start + 1; wideLExtent[end][state] = end - 1; } } } } } else { // "normal" chart initialization of the [start,start+1] cell int word = words[start]; int end = start + 1; Arrays.fill(tags[start], false); float[] iScore_start_end = iScore[start][end]; int[] narrowRExtent_start = narrowRExtent[start]; int[] narrowLExtent_end = narrowLExtent[end]; int[] wideRExtent_start = wideRExtent[start]; int[] wideLExtent_end = wideLExtent[end]; //Force tags String trueTagStr = null; if (sentence.get(start) instanceof HasTag) { trueTagStr = ((HasTag) sentence.get(start)).tag(); if ("".equals(trueTagStr)) { trueTagStr = null; } } // Another option for forcing tags: supply a regex 
String candidateTagRegex = null; /* if (sentence.get(start) instanceof CoreLabel) { candidateTagRegex = ((CoreLabel) sentence.get(start)).get(CandidatePartOfSpeechAnnotation.class); if ("".equals(candidateTagRegex)) { candidateTagRegex = null; } } */ //Word context (e.g., morphosyntactic info) String wordContextStr = null; if (sentence.get(start) instanceof HasContext) { wordContextStr = ((HasContext) sentence.get(start)).originalText(); if ("".equals(wordContextStr)) wordContextStr = null; } boolean assignedSomeTag = false; if (!floodTags || word == boundary) { // in this case we generate the taggings in the lexicon, // which may itself be tagging flexibly or using a strict lexicon. if (dumpTagging) { EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]", "UTF-8"); } for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start, wordContextStr); taggingI.hasNext();) { IntTaggedWord tagging = taggingI.next(); int state = stateIndex.indexOf(tagIndex.get(tagging.tag)); // if word was supplied with a POS tag, skip all taggings // not basicCategory() compatible with supplied tag. 
if (trueTagStr != null) { if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr)) || (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).startsWith(trueTagStr))) { if (dumpTagging) { EncodingPrintWriter.err.println(" Skipping " + tagging + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8"); } continue; } } if (candidateTagRegex != null) { if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex)) || (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).matches(candidateTagRegex))) { if (dumpTagging) { EncodingPrintWriter.err.println(" Skipping " + tagging + " as it doesn't match candidateTagRegex: " + candidateTagRegex, "UTF-8"); } continue; } } // try { float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr); // score the cell according to P(word|tag) in the lexicon if (lexScore > Float.NEGATIVE_INFINITY) { assignedSomeTag = true; iScore_start_end[state] = lexScore; narrowRExtent_start[state] = end; narrowLExtent_end[state] = start; wideRExtent_start[state] = end; wideLExtent_end[state] = start; } // } catch (Exception e) { // e.printStackTrace(); // System.out.println("State: " + state + " tags " + Numberer.getGlobalNumberer("tags").object(tagging.tag)); // } int tag = tagging.tag; tags[start][tag] = true; if (dumpTagging) { EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8"); } //if (start == length-2 && tagging.parent == puncTag) // lastIsPunc = true; } } // end if ( ! 
floodTags || word == boundary) if (!assignedSomeTag) { // If you got here, either you were using forceTags (gold tags) // and the gold tag was not seen with that word in the training data // or we are in floodTags=true (recovery parse) mode // Here, we give words all tags for // which the lexicon score is not -Inf, not just seen or // specified taggings if (dumpTagging) { EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8"); } for (int state = 0; state < numStates; state++) { if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) { if (trueTagStr != null) { String tagString = stateIndex.get(state); if (!tlp.basicCategory(tagString).equals(trueTagStr)) { continue; } } float lexScore = lex.score( new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start, wordIndex.get(word), wordContextStr); if (candidateTagRegex != null) { String tagString = stateIndex.get(state); if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) { continue; } } if (lexScore > Float.NEGATIVE_INFINITY) { iScore_start_end[state] = lexScore; narrowRExtent_start[state] = end; narrowLExtent_end[state] = start; wideRExtent_start[state] = end; wideLExtent_end[state] = start; } if (dumpTagging) { EncodingPrintWriter.err.println("Word pos " + start + " tagging " + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state)))) + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8"); } } } } // end if ! assignedSomeTag // tag multi-counting if (op.dcTags) { for (int state = 0; state < numStates; state++) { if (isTag[state]) { iScore_start_end[state] *= (1.0 + op.testOptions.depWeight); } } } if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) { // if parse failed because of tag coverage, we put in all tags with // a score of -1000, by fiat. You get here from the invocation of // parse(ls) inside parse(ls) *after* floodTags has been turned on. 
// Search above for "floodTags = true". if (dumpTagging) { EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8"); } for (int state = 0; state < numStates; state++) { if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) { iScore_start_end[state] = -1000.0f; narrowRExtent_start[state] = end; narrowLExtent_end[state] = start; wideRExtent_start[state] = end; wideLExtent_end[state] = start; } } } // Apply unary rules in diagonal cells of chart if (spillGuts) { tick("Terminal Unary..."); } for (int state = 0; state < numStates; state++) { float iS = iScore_start_end[state]; if (iS == Float.NEGATIVE_INFINITY) { continue; } UnaryRule[] unaries = ug.closedRulesByChild(state); for (UnaryRule ur : unaries) { int parentState = ur.parent; float pS = ur.score + lex.score(ur, start, end); float tot = iS + pS; if (tot > iScore_start_end[parentState]) { iScore_start_end[parentState] = tot; narrowRExtent_start[parentState] = end; narrowLExtent_end[parentState] = start; wideRExtent_start[parentState] = end; wideLExtent_end[parentState] = start; } } } if (spillGuts) { tick("Next word..."); } } } // end for start }
From source file:conditionalCFG.ConditionalCFGParser.java
License:Open Source License
// Reconstructs the best parse tree for chart cell [start, end] with root state
// `goal` by re-deriving which tagging / binary rule / unary rule produced the
// stored inside score (no backpointers are kept during parsing). Handles the
// multi-token-tag case, lattice input, and the normal single-word lexicon case,
// then tries binary splits and finally unary rules.
// NOTE(review): scraped listing with original line breaks lost; kept verbatim.
// Depends on enclosing-class state (iScore, wordsInSpan, sentence, lr, words,
// originalTags, lex, bg, ug, tf, stateIndex, tagIndex, wordIndex, floodTags,
// op) and on a matches(double,double) score-comparison helper, none visible here.
private Tree extractBestParse(int goal, int start, int end) { // find source of inside score // no backtraces so we can speed up the parsing for its primary use double bestScore = iScore[start][end][goal]; double normBestScore = op.testOptions.lengthNormalization ? (bestScore / wordsInSpan[start][end][goal]) : bestScore; String goalStr = stateIndex.get(goal); // check tags if (end - start <= op.testOptions.maxSpanForTags && tagIndex.contains(goalStr)) { if (op.testOptions.maxSpanForTags > 1) { Tree wordNode = null; if (sentence != null) { StringBuilder word = new StringBuilder(); for (int i = start; i < end; i++) { if (sentence.get(i) instanceof HasWord) { HasWord cl = (HasWord) sentence.get(i); word.append(cl.word()); } else { word.append(sentence.get(i).toString()); } } wordNode = tf.newLeaf(word.toString()); } else if (lr != null) { List<LatticeEdge> latticeEdges = lr.getEdgesOverSpan(start, end); for (LatticeEdge edge : latticeEdges) { IntTaggedWord itw = new IntTaggedWord(edge.word, stateIndex.get(goal), wordIndex, tagIndex); float tagScore = (floodTags) ? 
-1000.0f : lex.score(itw, start, edge.word, null); if (matches(bestScore, tagScore + (float) edge.weight)) { wordNode = tf.newLeaf(edge.word); if (wordNode.label() instanceof CoreLabel) { CoreLabel cl = (CoreLabel) wordNode.label(); cl.setBeginPosition(start); cl.setEndPosition(end); } break; } } if (wordNode == null) { throw new RuntimeException( "could not find matching word from lattice in parse reconstruction"); } } else { throw new RuntimeException("attempt to get word when sentence and lattice are null!"); } Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode)); tagNode.setScore(bestScore); if (originalTags[start] != null) { tagNode.label().setValue(originalTags[start].tag()); } return tagNode; } else { // normal lexicon is single words case IntTaggedWord tagging = new IntTaggedWord(words[start], tagIndex.indexOf(goalStr)); String contextStr = getCoreLabel(start).originalText(); float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr); if (tagScore > Float.NEGATIVE_INFINITY || floodTags) { // return a pre-terminal tree CoreLabel terminalLabel = getCoreLabel(start); Tree wordNode = tf.newLeaf(terminalLabel); Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode)); tagNode.setScore(bestScore); if (terminalLabel.tag() != null) { tagNode.label().setValue(terminalLabel.tag()); } if (tagNode.label() instanceof HasTag) { ((HasTag) tagNode.label()).setTag(tagNode.label().value()); } return tagNode; } } } // check binaries first for (int split = start + 1; split < end; split++) { for (Iterator<BinaryRule> binaryI = bg.ruleIteratorByParent(goal); binaryI.hasNext();) { BinaryRule br = binaryI.next(); double score = br.score + iScore[start][split][br.leftChild] + iScore[split][end][br.rightChild] + lex.score(br, start, end, split); boolean matches; if (op.testOptions.lengthNormalization) { double normScore = score / (wordsInSpan[start][split][br.leftChild] + wordsInSpan[split][end][br.rightChild]); 
matches = matches(normScore, normBestScore); } else { matches = matches(score, bestScore); } if (matches) { // build binary split Tree leftChildTree = extractBestParse(br.leftChild, start, split); Tree rightChildTree = extractBestParse(br.rightChild, split, end); List<Tree> children = new ArrayList<Tree>(); children.add(leftChildTree); children.add(rightChildTree); Tree result = tf.newTreeNode(goalStr, children); result.setScore(score); // System.err.println(" Found Binary node: "+result); return result; } } } // check unaries // note that even though we parse with the unary-closed grammar, we can // extract the best parse with the non-unary-closed grammar, since all // the intermediate states in the chain must have been built, and hence // we can exploit the sparser space and reconstruct the full tree as we go. // for (Iterator<UnaryRule> unaryI = ug.closedRuleIteratorByParent(goal); unaryI.hasNext(); ) { for (Iterator<UnaryRule> unaryI = ug.ruleIteratorByParent(goal); unaryI.hasNext();) { UnaryRule ur = unaryI.next(); // System.err.println(" Trying " + ur + " dtr score: " + iScore[start][end][ur.child]); double score = ur.score + iScore[start][end][ur.child] + lex.score(ur, start, end); boolean matches; if (op.testOptions.lengthNormalization) { double normScore = score / wordsInSpan[start][end][ur.child]; matches = matches(normScore, normBestScore); } else { matches = matches(score, bestScore); } if (ur.child != ur.parent && matches) { // build unary Tree childTree = extractBestParse(ur.child, start, end); Tree result = tf.newTreeNode(goalStr, Collections.singletonList(childTree)); // System.err.println(" Matched! Unary node: "+result); result.setScore(score); return result; } } System.err.println("Warning: no parse found in ExhaustivePCFGParser.extractBestParse: failing on: [" + start + ", " + end + "] looking for " + goalStr); return null; }
From source file:de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp.java
License:Open Source License
@SuppressWarnings("unchecked") public static <T extends HasWord> List<T> applyPtbEscaping(List<T> words, Collection<String> quoteBegin, Collection<String> quoteEnd) { PTBEscapingProcessor<T, String, Word> escaper = new PTBEscapingProcessor<T, String, Word>(); // Apply escaper to the whole sentence, not to each token individually. The // escaper takes context into account, e.g. when transforming regular double // quotes into PTB opening and closing quotes (`` and ''). words = (List<T>) escaper.apply(words); for (HasWord w : words) { if (quoteBegin != null && quoteBegin.contains(w.word())) { w.setWord("``"); } else if (quoteEnd != null && quoteEnd.contains(w.word())) { w.setWord("\'\'"); }/* www.j a va2 s.com*/ } return words; }
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils.java
License:Open Source License
public static List<HasWord> applyPtbEscaping(List<HasWord> words, Collection<String> quoteBegin, Collection<String> quoteEnd) { PTBEscapingProcessor<HasWord, String, Word> escaper = new PTBEscapingProcessor<HasWord, String, Word>(); // Apply escaper to the whole sentence, not to each token individually. The // escaper takes context into account, e.g. when transforming regular double // quotes into PTB opening and closing quotes (`` and ''). words = escaper.apply(words);//www . j a v a 2s. co m for (HasWord w : words) { if (quoteBegin != null && quoteBegin.contains(w.word())) { w.setWord("``"); } else if (quoteEnd != null && quoteEnd.contains(w.word())) { w.setWord("\'\'"); } } return words; }
From source file:edu.cmu.ark.AnalysisUtilities.java
License:Open Source License
/**
 * Splits a document into sentences, paragraph by paragraph, using Stanford's
 * DocumentPreprocessor, and returns each sentence as a space-joined string.
 *
 * @param document the raw document text
 * @return the detected sentences in order; empty list if none
 */
public static List<String> getSentences(String document) {
    DocumentPreprocessor dp = new DocumentPreprocessor(false);
    List<String> res = new ArrayList<String>();
    document = preprocess(document);
    String[] paragraphs = document.split("\\n");
    for (int i = 0; i < paragraphs.length; i++) {
        StringReader reader = new StringReader(paragraphs[i]);
        List<List<? extends HasWord>> sents = new ArrayList<List<? extends HasWord>>();
        try {
            sents = dp.getSentencesFromText(reader);
        } catch (Exception e) {
            // Best-effort: a paragraph that fails to segment is skipped, not fatal.
            e.printStackTrace();
        }
        for (List<? extends HasWord> tokens : sents) {
            // FIX: build the sentence with StringBuilder instead of repeated
            // String concatenation (was O(n^2) per sentence); also dropped the
            // redundant .toString() on word(), which already returns a String.
            StringBuilder sentence = new StringBuilder();
            for (HasWord token : tokens) {
                if (sentence.length() > 0) {
                    sentence.append(' ');
                }
                sentence.append(token.word());
            }
            res.add(sentence.toString());
        }
    }
    return res;
}
From source file:edu.nyu.nyuvis.cfutils.nlp.utils.CoreNLP.java
/**
 * Converts a Stanford {@code HasWord} into this project's {@code Word} by
 * copying the surface form.
 *
 * @param hw the token to convert
 * @return a new Word carrying {@code hw.word()}
 */
public static Word convert(HasWord hw) {
    Word result = new Word();
    result.word(hw.word());
    return result;
}
From source file:nlp.morph.noun.MainNounDetetectionLayer.java
License:Open Source License
// Command-line entry point: tokenizes the input file with Stanford's
// MaxentTagger, then runs each token through a Tamil morphological
// noun-detection pipeline (number/noun-list/article/pronoun/stem checks) and
// writes tagged output (<NUM>, <NOUN>, <VERB>, <ATL>, <PNN>, <JOIN>, <SYM>,
// <NOTDEFINED>) to output.txt.
// NOTE(review): scraped listing — the original line structure is lost and the
// code is kept byte-for-byte. The literal "?" characters inside some string
// comparisons appear to be mojibake from non-ASCII (likely Tamil) characters in
// the original source — TODO confirm against the original file before reuse.
// NOTE(review): FileNotFoundException/UnsupportedEncodingException are silently
// swallowed at the end; the original behavior is preserved here.
public static void main(String[] args) throws Exception { InitSystem.init(); if (args.length == 0) { System.out.println("Usage : java <Input File>"); System.exit(0); } List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new BufferedReader(new FileReader(args[0]))); PrintWriter writer; try { writer = new PrintWriter("output.txt", "UTF-8"); sentences.stream().map((input) -> { String[] words = new String[input.size()]; int i = 0; for (HasWord l : input) { words[i] = l.word(); i++; } return words; }).map((String[] words) -> { String wordBefore = ""; for (int index = 0; index < words.length; index++) { String temp = words[index].replaceAll("[.,'?/\"%]", ""); String outputSentence = ""; String stemSentence = ""; if (isNumeric(temp)) { outputSentence += words[index] + "\t<NUM> "; stemSentence += words[index] + "\t<NUM> "; } else if (NounListCheck.isExist(words[index])) { outputSentence += words[index] + "\t<NOUN> "; stemSentence += words[index] + "\t<NOUN> "; } else { List<TamilFontEntity> tamilWord = IOLayer.getTamil(temp); List<TamilFontEntity> cleanWord = tamilWord; if (!tamilWord.isEmpty()) { if (index != words.length - 1 && !words[index].contains(".") && tamilWord.get(tamilWord.size() - 1).getxLocation() != 9 && tamilWord.get(tamilWord.size() - 1).getyLocation() == -1 && IOLayer.getTamil(words[index + 1]).get(0).getxLocation() == tamilWord .get(tamilWord.size() - 1).getxLocation()) { cleanWord = tamilWord.subList(0, tamilWord.size() - 1); } } String tempWord = IOLayer.getText(cleanWord).toString(); if (tempWord.length() > 0) { if (index == words.length - 2) { if (cleanWord.size() > 2) { outputSentence += words[index] + "\t<VERB>"; stemSentence += words[index] + "\t<VERB>"; } else { outputSentence += words[index] + "\t<NOTDEFINED>"; stemSentence += words[index] + "\t<NOTDEFINED>"; } } else { List<TamilFontEntity> stem = detectStems(tempWord, wordBefore); if (NumberDetectorLayer.isNumber(cleanWord)) { outputSentence += words[index] + 
"\t<NUM> "; stemSentence += words[index] + "\t<NUM> "; } else if (PronounAndArticleDetector.isArticle(tempWord)) { outputSentence += words[index] + "\t<ATL> "; stemSentence += words[index] + "\t<ATL> "; } else if (PronounAndArticleDetector.isProNoun(tempWord)) { outputSentence += words[index] + "\t<PNN> "; stemSentence += words[index] + "\t<PNN> "; } else if (stem != null) { outputSentence += words[index] + "\t<NOUN> "; stemSentence += IOLayer.getText(stem) + "\t<NOUN> "; } else { stem = getNounStem(tempWord); if (stem != null) { outputSentence += words[index] + "\t<NOUN> "; stemSentence += IOLayer.getText(stem) + "\t<NOUN> "; } else if (ExtractNounStemMLayer.extractStemNounM(tempWord) != null) { List<TamilFontEntity> stemWithoutM = ExtractNounStemMLayer .extractStemNounM(tempWord); stemWithoutM = extractPluralStem(stemWithoutM); if (stemWithoutM.equals(IOLayer.getTamil(tempWord))) { outputSentence += IOLayer.getText(stemWithoutM) + "\t<JOIN> "; stemSentence += IOLayer.getText(stemWithoutM) + "\t<JOIN> "; } else { outputSentence += IOLayer.getText(stemWithoutM) + "\t<NOUN> "; stemSentence += IOLayer.getText(stemWithoutM) + "\t<NOUN> "; } } else { outputSentence += IOLayer.getText(cleanWord) + "\t<NOTDEFINED> "; stemSentence += IOLayer.getText(cleanWord) + "\t<NOTDEFINED> "; } } } } } if (temp.trim().equals("")) { writer.println(words[index] + "\t<SYM>"); } else { writer.println(stemSentence); wordBefore = words[index]; } } return wordBefore; }).map((wordBefore) -> IOLayer.getTamil(wordBefore)).filter((tamilWord) -> (tamilWord.size() > 2)) .forEach((_item) -> { writer.println(""); }); writer.close(); } catch (FileNotFoundException | UnsupportedEncodingException e) { } }
From source file:nlp.morph.noun.MainNounDetetectionLayer.java
License:Open Source License
// Runs the same Tamil morphological tagging pipeline as main(), but returns the
// result as data: a list of sentences, each a list of {surface/stem, tag}
// String[2] pairs, while also logging stems to resources/morphed_stem.txt.
// Returns null on FileNotFoundException.
// NOTE(review): scraped listing — original line structure lost; code kept
// byte-for-byte. The literal "?" characters in string comparisons look like
// mojibake for Tamil sentence-final characters — TODO confirm against the
// original source before relying on those branches.
// NOTE(review): if the PrintWriter constructor throws, `pw` stays null and the
// finally block's pw.close() will NPE — a latent bug preserved here as-is.
public static List<List<String[]>> getMorph(String data) { PrintWriter pw = null; try { List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(data)); int count = sentences.size(); List<List<String[]>> morphOutput = new ArrayList<>(); ArrayList<String[]> list = new ArrayList<>(); boolean checkSentence = true; pw = new PrintWriter(new File("resources/morphed_stem.txt")); for (List<HasWord> input : sentences) { String[] words = new String[input.size()]; int i = 0; for (HasWord l : input) { words[i] = l.word(); i++; } String wordBefore = ""; if (checkSentence) { if (list.size() > 0) { morphOutput.add(list); } list = new ArrayList<>(); checkSentence = false; pw.println(); } for (int index = 0; index < words.length; index++) { String temp = words[index].replaceAll("[.,/?\"'`#$%]", ""); String outputSentence = ""; String stemSentence = ""; if (isNumeric(temp)) { outputSentence = "NUM"; stemSentence = temp; } else if (NounListCheck.isExist(words[index])) { outputSentence = "NOUN"; stemSentence = words[index]; } else { List<TamilFontEntity> tamilWord = IOLayer.getTamil(temp); List<TamilFontEntity> cleanWord = tamilWord; if (!tamilWord.isEmpty()) { if (index != words.length - 1 && !words[index].contains(".") && tamilWord.get(tamilWord.size() - 1).getxLocation() != 9 && tamilWord.get(tamilWord.size() - 1).getyLocation() == -1 && IOLayer.getTamil(words[index + 1]).get(0).getxLocation() == tamilWord .get(tamilWord.size() - 1).getxLocation()) { cleanWord = tamilWord.subList(0, tamilWord.size() - 1); } } String tempWord = IOLayer.getText(cleanWord).toString(); if (tempWord.length() > 0) { if (index == words.length - 2) { if (cleanWord.size() > 2) { outputSentence = "VERB"; stemSentence = words[index]; } else { outputSentence = "NOTDEFINED"; stemSentence = IOLayer.getText(cleanWord).toString(); } } else { List<TamilFontEntity> stem = detectStems(tempWord, wordBefore); if (NumberDetectorLayer.isNumber(cleanWord)) { outputSentence 
= "NUM"; stemSentence = IOLayer.getText(cleanWord).toString(); } else if (PronounAndArticleDetector.isArticle(tempWord)) { outputSentence = "ATL"; stemSentence = tempWord; } else if (PronounAndArticleDetector.isProNoun(tempWord)) { outputSentence = "PNN"; stemSentence = tempWord; } else if (stem != null) { outputSentence = "NOUN"; stemSentence = tempWord; } else { stem = getNounStem(tempWord); if (stem != null) { outputSentence = "NOUN"; stemSentence = IOLayer.getText(stem).toString(); } else if (ExtractNounStemMLayer.extractStemNounM(tempWord) != null) { List<TamilFontEntity> stemWithoutM = ExtractNounStemMLayer .extractStemNounM(tempWord); stemWithoutM = extractPluralStem(stemWithoutM); stemSentence = IOLayer.getText(stemWithoutM).toString(); if (stemWithoutM.equals(IOLayer.getTamil(tempWord))) { outputSentence = "JOIN"; } else { outputSentence = "NOUN"; stemSentence = words[index]; } } else { outputSentence = "NOTDEFINED"; } } } } } String[] word = new String[2]; if (words[index].startsWith("?") || words[index].startsWith("?") || words[index].startsWith("")) { String[] w = list.remove(list.size() - 1); w[0] = w[0] + words[index]; list.add(w); } else if (temp.trim().equals("")) { word[0] = words[index]; word[1] = "SYM"; pw.println(word[0]); list.add(word); } else { word[0] = words[index]; if (word[0].contains(".") && !word[0].matches(".*\\d.*")) { String[] wo = word[0].split("\\."); int j = 0; for (int k = 0; k < words[index].length();) { if (String.valueOf(words[index].charAt(k)).contains(".")) { word = new String[2]; word[0] = "."; word[1] = "SYM"; pw.println(word[0]); list.add(word); List<TamilFontEntity> tamilWord = IOLayer.getTamil(wordBefore); if (tamilWord.size() > 2 && !wordBefore.contains("?") && !wordBefore.contains("?") && !wordBefore.contains("?") && !wordBefore.contains("?") && !isNumeric(wordBefore)) { if (list.size() > 0) { morphOutput.add(list); } pw.println(); list = new ArrayList<>(); } list.add(word); k++; } else { word = new String[2]; word[0] 
= wo[j]; pw.println(wo[j]); if (isNumeric(word[0])) { word[1] = "NUM"; } else { word[1] = "NOTDEFINED"; } list.add(word); k = k + wo[j].length(); wordBefore = wo[j]; j++; } } } else { pw.println(stemSentence); word[0] = words[index]; pw.println(word[0]); word[1] = outputSentence; list.add(word); wordBefore = word[0]; } } } List<TamilFontEntity> tamilWord = IOLayer.getTamil(wordBefore); if (tamilWord.size() > 2 && !wordBefore.contains("?") && !wordBefore.contains("?") && !wordBefore.contains("?") && !isNumeric(wordBefore)) { checkSentence = true; pw.println(); } } if (list.size() > 0) { morphOutput.add(list); } return morphOutput; } catch (FileNotFoundException ex) { Logger.getLogger(MainNounDetetectionLayer.class.getName()).log(Level.SEVERE, null, ex); } finally { pw.close(); } return null; }