Usage examples for org.deeplearning4j.text.tokenization.tokenizer.preprocessor.EndingPreProcessor
From source file:DL4J.java
License:Open Source License
public void run() throws IOException { log.info("Load data..."); // ClassPathResource resource = new ClassPathResource("raw_sentences.txt"); // SentenceIterator iter = new LineSentenceIterator(resource.getFile()); SentenceIterator iter = new LineSentenceIterator(new File("/Users/Joowon/Desktop/testFile.txt")); iter.setPreProcessor((SentencePreProcessor) String::toLowerCase); log.info("Tokenize data..."); final EndingPreProcessor preProcessor = new EndingPreProcessor(); TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(token -> { token = token.toLowerCase();//from w w w . j a va2 s. com String base = preProcessor.preProcess(token); base = base.replaceAll("\\d", "d"); if (base.endsWith("ly") || base.endsWith("ing")) System.out.println(); return base; }); log.info("Build model..."); int batchSize = 1000; int iterations = 3; int layerSize = 300; Word2Vec vec = new Word2Vec.Builder().batchSize(batchSize) // # words per minibatch. .minWordFrequency(5) // .useAdaGrad(false) // .layerSize(layerSize) // word feature vector size .iterations(iterations) // # iterations to train .learningRate(0.025) // .minLearningRate(1e-3) // learning rate decays wrt # words. 
floor learning .negativeSample(10) // sample size 10 words .iterate(iter) // .tokenizerFactory(tokenizerFactory) // .build(); vec.fit(); log.info("evaluate model..."); double sim = vec.similarity("people", "money"); log.info("Similarity between peeple and money: " + sim); Collection<String> similar = vec.wordsNearest("human", 10); log.info("Similarity words to 'day' : " + similar); // log.info("Plot TSNE"); // BarnesHutTsne tsne = new BarnesHutTsne.Builder() // .setMaxIter(1000) // .stopLyingIteration(250) // .learningRate(500) // .useAdaGrad(false) // .theta(0.5) // .setMomentum(0.5) // .normalize(true) // .usePca(false) // .build(); // vec.lookupTable().plotVocab(tsne); log.info("Save vectors...."); WordVectorSerializer.writeWordVectors(vec, "/Users/Joowon/Desktop/words.txt"); }
From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java
public static void main(String[] args) throws Exception { log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = new BasicLineIterator("gov-annc.txt"); iter.setPreProcessor(new SentencePreProcessor() { @Override/*from w ww .j a v a2 s . co m*/ public String preProcess(String sentence) { return sentence.toLowerCase(); } }); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); final EndingPreProcessor preProcessor = new EndingPreProcessor(); t.setTokenPreProcessor(new TokenPreProcess() { @Override public String preProcess(String token) { token = token.toLowerCase(); String base = preProcessor.preProcess(token); base = base.replaceAll("\\d", "d"); if (base.endsWith("ly") || base.endsWith("ing")) { System.out.println(); } return base; } }); /* CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+ So, effectively all numbers, punctuation symbols and some special symbols are stripped off. Additionally it forces lower case for all tokens. */ t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt"); Collection<String> lst = vec.wordsNearest("information", 10); log.info("on 1st run: " + lst); System.out.println(vec.similarity("information", "data")); System.out.println(vec.similarity("information", "magic")); }