List of usage examples for org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess
TokenPreProcess
From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java
public static void main(String[] args) throws Exception { log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = new BasicLineIterator("gov-annc.txt"); iter.setPreProcessor(new SentencePreProcessor() { @Override//from ww w . j a v a2 s. com public String preProcess(String sentence) { return sentence.toLowerCase(); } }); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); final EndingPreProcessor preProcessor = new EndingPreProcessor(); t.setTokenPreProcessor(new TokenPreProcess() { @Override public String preProcess(String token) { token = token.toLowerCase(); String base = preProcessor.preProcess(token); base = base.replaceAll("\\d", "d"); if (base.endsWith("ly") || base.endsWith("ing")) { System.out.println(); } return base; } }); /* CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+ So, effectively all numbers, punctuation symbols and some special symbols are stripped off. Additionally it forces lower case for all tokens. */ t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt"); Collection<String> lst = vec.wordsNearest("information", 10); log.info("on 1st run: " + lst); System.out.println(vec.similarity("information", "data")); System.out.println(vec.similarity("information", "magic")); }
From source file:net.liaocy.ml4j.nlp.word2vec.Train.java
/**
 * Fits a Word2Vec model to the supplied sentences and hands the fitted model to
 * {@code saveModel} under the given name. Progress messages go to standard output.
 *
 * @param commaSentences sentences to train on; wrapped in a {@code CollectionSentenceIterator}
 * @param lang           language passed to the {@code MyTokenizerFactory} constructor
 * @param modelName      name forwarded to {@code saveModel} together with the fitted model
 * @throws IOException declared by the signature; NOTE(review): presumably raised while
 *                     saving the model - confirm against {@code saveModel}
 */
public void train(Collection<String> commaSentences, Language lang, String modelName) throws IOException {
    System.out.println("Load & Vectorize Sentences....");

    // Sentences pass through unchanged: the sentence pre-processor is an identity.
    SentencePreProcessor identitySentence = new SentencePreProcessor() {
        @Override
        public String preProcess(String sentence) {
            return sentence;
        }
    };
    SentenceIterator sentences = new CollectionSentenceIterator(commaSentences);
    sentences.setPreProcessor(identitySentence);

    // Tokens pass through unchanged as well: the token pre-processor is an identity.
    TokenPreProcess identityToken = new TokenPreProcess() {
        @Override
        public String preProcess(String token) {
            return token;
        }
    };
    MyTokenizerFactory tokenizers = new MyTokenizerFactory(lang);
    tokenizers.setTokenPreProcessor(identityToken);

    System.out.println("Building model....");
    Word2Vec model = new Word2Vec.Builder()
            .minWordFrequency(0)
            .iterations(1)
            .layerSize(200)
            .seed(42)
            .windowSize(5)
            .learningRate(0.025)
            .iterate(sentences)
            .tokenizerFactory(tokenizers)
            .build();

    System.out.println("Fitting Word2Vec model....");
    model.fit();

    System.out.println("Save Model...");
    this.saveModel(modelName, model);
}