List of usage examples for edu.stanford.nlp.ie.ner CMMClassifier getClassifier
public static CMMClassifier<? extends CoreLabel> getClassifier(InputStream in) throws IOException, ClassCastException, ClassNotFoundException
From source file:lv.lumii.expressions.Expression.java
License:Open Source License
public static void initClassifier(String model) throws Exception { morphoClassifier = CMMClassifier.getClassifier(model); analyzer = LVMorphologyReaderAndWriter.getAnalyzer(); // Assumption - that the morphology model actually loads the LVMorphologyReaderAndWriter data, so it should be filled. }// ww w.ja v a 2 s .com
From source file:lv.lumii.morphotagger.MorphoCRF.java
License:Open Source License
/** * @param args/*from w ww. ja va 2s.com*/ * @throws IOException * @throws ClassNotFoundException * @throws ClassCastException */ public static void main(String[] args) throws IOException, ClassCastException, ClassNotFoundException { String trainfile = "MorphoCRF/train_dev.txt"; String testfile = "MorphoCRF/test.txt"; boolean train = false; for (int i = 0; i < args.length; i++) { if (args[i].equalsIgnoreCase("-train")) { train = true; } if (args[i].equalsIgnoreCase("-dev")) { trainfile = "MorphoCRF/train.txt"; testfile = "MorphoCRF/dev.txt"; } if (args[i].equalsIgnoreCase("-production")) { trainfile = "MorphoCRF/all.txt"; testfile = "MorphoCRF/test.txt"; } } String pretrainedModel = "models/lv-morpho-model.ser.gz"; String classifierOutput = "MorphoCRF/lv-morpho-model.ser.gz"; //Properties props = StringUtils.propFileToProperties("/Users/pet/Documents/java/PaikensNER/MorfoCRF/lv-PP.prop"); Properties props = new Properties(); props.setProperty("useLVMorphoAnalyzer", "true"); props.setProperty("LVMorphoAnalyzerTag", AttributeNames.i_PartOfSpeech); //props.setProperty("LVMorphoAnalyzerTag", AttributeNames.i_Case); props.setProperty("useLVMorphoAnalyzerPOS", "true"); props.setProperty("useLVMorphoAnalyzerTag", "true"); props.setProperty("useLVMorphoAnalyzerPrev", "true"); props.setProperty("useLVMorphoAnalyzerNext", "true"); props.setProperty("useLVMorphoAnalyzerItemIDs", "true"); props.setProperty("saveFeatureIndexToDisk", "true"); props.setProperty("maxLeft", "1"); props.setProperty("useWord", "true"); //props.setProperty("use2W", "true"); //props.setProperty("usePrevSequences", "true"); //props.setProperty("useClassFeature", "true"); //props.setProperty("useTypeSeqs2", "true"); //props.setProperty("useSequences", "true"); props.setProperty("wordShape", "dan2useLC"); //props.setProperty("useTypeySequences", "true"); //props.setProperty("useDisjunctive", "true"); props.setProperty("noMidNGrams", "true"); props.setProperty("maxNGramLeng", "6"); 
props.setProperty("useNGrams", "true"); //props.setProperty("usePrev", "true"); //props.setProperty("useNext", "true"); //props.setProperty("useTypeSeqs", "true"); props.setProperty("readerAndWriter", "edu.stanford.nlp.sequences.LVMorphologyReaderAndWriter"); props.setProperty("map", "word=0,answer=1,lemma=2"); AbstractSequenceClassifier<CoreLabel> crf = new CMMClassifier<CoreLabel>(props); DocumentReaderAndWriter reader = crf.makeReaderAndWriter(); if (train) { ObjectBank<List<CoreLabel>> documents = crf.makeObjectBankFromFile(trainfile, reader); crf.train(documents, reader); //atbilstoi props datiem crf.serializeClassifier(classifierOutput); } else { crf = CMMClassifier.getClassifier(pretrainedModel); } testData(crf, testfile, reader); }
From source file:lv.lumii.morphotagger.MorphoPipe.java
License:Open Source License
/**
 * Command-line entry point for the Latvian morphological tagger pipeline.
 *
 * <p>Parses flags selecting input format (plain/paragraph/vertical/CONLL/JSON)
 * and output format (JSON/tab/vert/moses/CONLL-X/XML/VISL-CG/lemmatized text),
 * prints its own usage text on {@code -h}, loads the CMM classifier from
 * {@code morphoClassifierLocation}, then reads stdin (UTF-8) and tags
 * sentence-by-sentence to stdout until EOF or a blank line (unless
 * {@code -allow-empty-lines}). In vertical input mode, tokens are accumulated
 * until a {@code </s>} marker closes the sentence.
 *
 * <p>NOTE(review): the code below is preserved byte-for-byte from the scraped
 * source, including mid-string line breaks introduced by the scrape; no inline
 * comments could be inserted because every internal line boundary falls inside
 * a statement or string literal.
 *
 * @param args command-line flags; run with {@code -h} for the full list
 * @throws Exception propagated from model loading or I/O
 */
public static void main(String[] args) throws Exception { for (int i = 0; i < args.length; i++) { if (args[i].equalsIgnoreCase("-tab")) { // one response line per each query line, tab-separated outputType = outputTypes.TAB; token_separator = "\t"; }// w ww. j ava 2s . com if (args[i].equalsIgnoreCase("-rus")) { // ruscorpora.ru format outputType = outputTypes.RUSCORPORA; } if (args[i].equalsIgnoreCase("-vert")) { // one response line per token, tab-separated outputType = outputTypes.VERT; } if (args[i].equalsIgnoreCase("-moses")) { // one response line per token, pipe-separated field_separator = "|"; token_separator = " "; outputType = outputTypes.MOSES; } if (args[i].equalsIgnoreCase("-stripped")) mini_tag = true; //remove nonlexical attributes if (args[i].equalsIgnoreCase("-features")) features = true; //output training features if (args[i].equalsIgnoreCase("-leta")) LETAfeatures = true; //output specific features for LETA semantic frame analysis if (args[i].equalsIgnoreCase("-vertinput")) inputType = inputTypes.VERT; //vertical input format as requested by Milos Jakubicek 2012.11.01 if (args[i].equalsIgnoreCase("-paragraphs")) { inputType = inputTypes.PARAGRAPH; if (i + 1 < args.length && !args[i + 1].startsWith("-")) { try { sentencelengthcap = Integer.parseInt(args[i + 1]); System.err.printf("Sentence length capped to %d\n", sentencelengthcap); i++; } catch (Exception e) { System.err.printf("Error when parsing command line param '%s %s'\n", args[i], args[i + 1]); System.err.println(e.getMessage()); } } } if (args[i].equalsIgnoreCase("-conll-in")) inputType = inputTypes.CONLL; if (args[i].equalsIgnoreCase("-json-in")) inputType = inputTypes.JSON; if (args[i].equalsIgnoreCase("-conll-x")) outputType = outputTypes.CONLL_X; if (args[i].equalsIgnoreCase("-xml")) outputType = outputTypes.XML; if (args[i].equalsIgnoreCase("-visl-cg")) outputType = outputTypes.VISL_CG; if (args[i].equalsIgnoreCase("-lemmatized-text")) outputType = outputTypes.lemmatizedText; if
(args[i].equalsIgnoreCase("-saveColumns")) saveColumns = true; //save extra columns from conll input if (args[i].equalsIgnoreCase("-unix-line-endings")) eol = "\n"; if (args[i].equalsIgnoreCase("-keep-tags")) keepTags = true; if (args[i].equalsIgnoreCase("-output-separators")) outputSeparators = true; if (args[i].equalsIgnoreCase("-allow-empty-lines")) stopOnEmpty = false; if (args[i].equalsIgnoreCase("-h") || args[i].equalsIgnoreCase("--help") || args[i].equalsIgnoreCase("-?")) { System.out.println("LV morphological tagger"); System.out.println("\nInput formats"); System.out.println( "\tDefault : plain text UTF-8, one sentence per line, terminated by a blank line."); System.out.println( "\t-paragraphs [lengthcap]: plain text UTF-8, each line will be split in sentences. In output, paragraph borders are noted by an extra blank line. If lengthcap parameter is provided, then sentence length will be limited to that, instead of the default of " + sentencelengthcap); System.out.println( "\t-vertinput : one line per token, sentences separated by <s></s>. Any XML-style tags are echoed as-is. \n\t\tNB! sentences are retokenized, the number of tokens may be different."); System.out.println( "\t-conll-in : CONLL shared task data format - one line per token, with tab-delimited columns, sentences separated by blank lines."); System.out.println( "\t-json-in : one line per sentence, each line contains a single json array of strings-tokens."); System.out.println("\nOutput formats"); System.out.println( "\tDefault : JSON. 
Each sentence is returned as a list of dicts, each dict contains elements 'Word', 'Tag' and 'Lemma'."); System.out.println( "\t-tab : one response line for each query line; tab-separated lists of word, tag and lemma."); System.out.println( "\t-vert : one response line for each token; tab-separated lists of word, tag and lemma."); System.out.println( "\t-moses : one response line for each token; pipe-separated lists of word, tag and lemma."); System.out.println( "\t-conll-x : CONLL-X shared task data format - one line per token, with tab-delimited columns, sentences separated by blank lines."); System.out.println("\t-xml : one xml word per line"); System.out.println("\t-visl-cg : output format for VISL constraint grammar tool"); System.out.println( "\t-lemmatized-text : output lowercase lemmatized text, each sentence in new row, tokens seperated by single space"); System.out.println("\nOther options:"); System.out.println( "\t-stripped : lexical/nonessential parts of the tag are replaced with '-' to reduce sparsity."); System.out.println( "\t-features : in conll output, include the features that were used for training/tagging."); System.out.println( "\t-leta : in conll output, include extra features used for semantic frame analysis."); System.out.println("\t-saveColumns : save extra columns from conll input."); System.out .println("\t-unix-line-endings : use \\n line endings for output even on windows systems"); System.out .println("\t-keep-tags : preserve lines that start with '<' to enable xml-style metadata"); System.out .println("\t-output-separators : put <s></s> sentence markup and <p></p> paragraph markup"); System.out.println("\t-allow-empty-lines : do not quit on blank lines input (as per default)"); System.out.flush(); System.exit(0); } } CMMClassifier<CoreLabel> morphoClassifier = CMMClassifier.getClassifier(morphoClassifierLocation); PrintStream out = new PrintStream(System.out, true, "UTF8"); BufferedReader in = new BufferedReader(new 
InputStreamReader(System.in, "UTF8")); switch (inputType) { case CONLL: for (List<CoreLabel> sentence : readCONLL(in)) { outputSentence(morphoClassifier, out, sentence); } break; default: String s; String sentence = ""; while ((s = in.readLine()) != null && (s.length() != 0 || !stopOnEmpty)) { if (s.startsWith("<") && s.length() > 1 && keepTags) { if (outputType != outputTypes.lemmatizedText) out.println(s); continue; } if (s.length() == 0) continue; boolean finished = true; // is sentence finished and ready to analyze if (inputType != inputTypes.VERT) { sentence = s; } else { if (s.startsWith("<") && s.length() > 1) out.println(s); else sentence = sentence + " " + s; finished = s.startsWith("</s>"); } if (finished) { processSentences(morphoClassifier, out, sentence.trim()); sentence = ""; } } if (inputType != inputTypes.VERT && sentence.length() > 0) { //FIXME, not DRY processSentences(morphoClassifier, out, sentence.trim()); } } in.close(); out.close(); }
From source file:lv.lumii.morphotagger.MorphoPipeLETAArticle.java
License:Open Source License
public static void getParagrapgInSentences(String inputTypePar, String outputTypePar, String otherOptionPar, String fileNameOut, String fileNameIn) throws ClassCastException, ClassNotFoundException, IOException { if (inputTypePar.equalsIgnoreCase("-paragraphs")) inputType = inputTypes.PARAGRAPH; CMMClassifier<CoreLabel> morphoClassifier = CMMClassifier.getClassifier(morphoClassifierLocation); //PrintStream out = new PrintStream(System.out, true, "UTF8"); PrintStream out = new PrintStream(new BufferedOutputStream(new FileOutputStream(fileNameOut, true)), true, "UTF8"); BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(fileNameIn), "UTF8"));//System.in String sentence = ""; HashSet<String> sentencesSet = new HashSet<>(); while ((sentence = in.readLine()) != null) { if (sentence.length() != 0) { splitInSentences(sentencesSet, sentence.trim()); sentence = ""; }/* ww w.java 2 s .co m*/ } if (sentencesSet.size() > 0) { Iterator<String> iterator = sentencesSet.iterator(); while (iterator.hasNext()) { out.println(iterator.next()); } } in.close(); out.close(); }
From source file:lv.pipe.MorphoTagger.java
License:Open Source License
public void init(Properties prop) { try {// www .ja v a2 s . co m morphoClassifier = CMMClassifier.getClassifier(prop.getProperty("morpho.classifierPath", "")); } catch (ClassCastException e) { e.printStackTrace(); } catch (ClassNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } }
From source file:lv.semti.morphology.webservice.MorphoServer.java
License:Open Source License
/**
 * Entry point of the LV morphology web service: parses {@code -transliterator},
 * {@code -port} and help flags, initializes the analyzer, tag set and the
 * statistical morphology classifier, then builds a Restlet {@code Component},
 * attaches all HTTP resource routes (analyze, tokenize, inflect, tagger, ...)
 * and starts the HTTP server on the configured port (default 8182).
 *
 * <p>NOTE(review): the code below is preserved byte-for-byte from the scraped
 * source (including mid-string line breaks introduced by the scrape); only one
 * mojibake Latvian comment has been translated to English. No inline comments
 * could be inserted because every internal line boundary falls inside a
 * statement or string literal.
 *
 * @param args command-line flags; run with {@code -h} for the full list
 * @throws Exception propagated from model loading or server startup
 */
public static void main(String[] args) throws Exception { for (int i = 0; i < args.length; i++) { if (args[i].equalsIgnoreCase("-transliterator")) { enableTransliterator = true; System.out.println("Transliteration services enabled"); }//from w ww . jav a 2 s .co m if (args[i].equalsIgnoreCase("-port")) { if (i + 1 < args.length && !args[i + 1].startsWith("-")) { try { port = Integer.parseInt(args[i + 1]); i++; } catch (Exception e) { System.err.printf("Error when parsing command line parameter '%s %s'\n", args[i], args[i + 1]); System.err.println(e.getMessage()); System.exit(64); //EX_USAGE flag according to sysexit.h 'standard' } } } if (args[i].equalsIgnoreCase("-h") || args[i].equalsIgnoreCase("--help") || args[i].equalsIgnoreCase("-?")) { System.out.println("Webservice for LV morphological analysis&inflection, and morphological tagger"); System.out.println("\nCommand line options:"); System.out.println( "\t-transliterator : enable webservice for historical text transliteration (NB! the extra dictionary files and language models need to be included)"); System.out.println( "\t-port 1234 : sets the web server port to some other number than the default 8182"); System.out.println("\nWebservice access:"); System.out.println( "http://localhost:8182/analyze/[word] : morphological analysis of the word (guessing of out-of-vocabulary words disabled by default)"); System.out.println( "http://localhost:8182/tokenize/[query] or POST to http://localhost:8182/tokenize : tokenization of sentences"); System.out.println( "http://localhost:8182/verbs/[query] and http://localhost:8182/neverbs/[query] : Support webservice for 'verbs' valency annotation tool - possible inflections of wordform"); System.out.println( "http://localhost:8182/normalize/[ruleset]/[word] and http://localhost:8182/explain/[query] : (if enabled) historical word transliteration and dictionary explanations"); System.out.println( "http://localhost:8182/inflect/json/[query] : generate all inflectional forms of a 
lemma"); System.out.println( "http://localhost:8182/inflect_people/json/[query]?gender=[m/f] : generate all inflectional forms of words, assuming that they are person names"); System.out.println( "http://localhost:8182/inflect_phrase/[phrase]?category=[person/org/loc] : try to inflect a multiword expression / named entity, given its category"); System.out.println( "http://localhost:8182/morphotagger/[query] : do statistical morphological disambiguation of a sentence"); System.out.flush(); System.exit(0); } } analyzer = new Analyzer("dist/Lexicon.xml", false); analyzer.setCacheSize(1000); tagset = TagSet.getTagSet(); //NERclassifier = CRFClassifier.getClassifierNoExceptions("dist/models/lv-ner-model.ser.gz"); //NERclassifier.flags.props.setProperty("gazette", "./Gazetteer/LV_LOC_GAZETTEER.txt,./Gazetteer/LV_PERS_GAZETTEER.txt,./Gazetteer/PP_Onomastica_surnames.txt,./Gazetteer/PP_Onomastica_geonames.txt,./Gazetteer/PP_valstis.txt,./Gazetteer/PP_orgnames.txt,./Gazetteer/PP_org_elements.txt"); //NERclassifier.featureFactory.init(NERclassifier.flags); LVMorphologyReaderAndWriter.setPreloadedAnalyzer(analyzer); // so that the analyzer data is not loaded again unnecessarily String morphoClassifierLocation = "dist/models/lv-morpho-model.ser.gz"; morphoClassifier = CMMClassifier.getClassifier(morphoClassifierLocation); Expression.setClassifier(morphoClassifier); // Create a new Restlet component and add a HTTP server connector to it Component component = new Component(); component.getServers().add(Protocol.HTTP, port); // Then attach it to the local host component.getDefaultHost().attach("/analyze/{word}", WordResource.class); component.getDefaultHost().attach("/analyze/{language}/{word}", WordResource.class); component.getDefaultHost().attach("/tokenize/{query}", TokenResource.class); component.getDefaultHost().attach("/tokenize", TokenResource.class); component.getDefaultHost().attach("/verbi/{query}", VerbResource.class); //obsolete, jaaiznjem component.getDefaultHost().attach("/verbs/{query}", 
VerbResource.class); component.getDefaultHost().attach("/neverbs/{query}", NonVerbResource.class); if (enableTransliterator) { Transliterator.PATH_FILE = "dist/path.conf"; translit = Transliterator.getTransliterator(analyzer); component.getDefaultHost().attach("/explain/{word}", DictionaryResource.class); component.getDefaultHost().attach("/normalize/{ruleset}/{word}", TransliterationResource.class); } component.getDefaultHost().attach("/inflect/{format}/{query}", InflectResource.class); component.getDefaultHost().attach("/inflect/{format}/{language}/{query}", InflectResource.class); component.getDefaultHost().attach("/inflect_people/{format}/{query}", InflectPeopleResource.class); component.getDefaultHost().attach("/inflect_phrase/{phrase}", InflectPhraseResource.class); component.getDefaultHost().attach("/normalize_phrase/{phrase}", NormalizePhraseResource.class); component.getDefaultHost().attach("/nertagger/{query}", NERTaggerResource.class); component.getDefaultHost().attach("/morphotagger/{query}", MorphoTaggerResource.class); component.getDefaultHost().attach("/phonetic_transcriber/{phrase}", PhoneticTranscriberResource.class); // Now, let's start the component! // Note that the HTTP server connector is also automatically started. component.start(); System.out.println( "Usage sample for entity inflection:\nhttp://localhost:8182/inflect_phrase/Vaira Ve-Freiberga?category=person"); }