Java tutorial
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package edu.polyu.comp5412.word2vec;

import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Collection;
import java.util.Locale;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.text.sentenceiterator.SentencePreProcessor;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.EndingPreProcessor;

/**
 * Command-line example that trains a Word2Vec model on a line-oriented English
 * text corpus, writes the resulting word vectors to a file, and prints a few
 * nearest-neighbour / similarity queries as a sanity check.
 *
 * <p>Usage: {@code Word2VecTrainningEng [corpusPath [vectorsPath]]}. Both
 * arguments are optional and default to {@code gov-annc.txt} and
 * {@code gov-annc-vec.txt} respectively.
 */
public class Word2VecTrainningEng {

    private static final Logger log = LoggerFactory.getLogger(Word2VecTrainningEng.class);

    /** Corpus file read when no path is supplied on the command line. */
    private static final String DEFAULT_CORPUS_PATH = "gov-annc.txt";

    /** File the word vectors are written to when no path is supplied on the command line. */
    private static final String DEFAULT_VECTORS_PATH = "gov-annc-vec.txt";

    /**
     * Trains the model and prints the example queries.
     *
     * @param args optional: {@code args[0]} is the corpus path, {@code args[1]}
     *             is the output path for the word vectors
     * @throws Exception if the corpus cannot be read, or if training or writing
     *                   the vectors fails
     */
    public static void main(String[] args) throws Exception {
        final String corpusPath = argOrDefault(args, 0, DEFAULT_CORPUS_PATH);
        final String vectorsPath = argOrDefault(args, 1, DEFAULT_VECTORS_PATH);

        log.info("Load & Vectorize Sentences....");
        // Reads the corpus one line at a time; each line is treated as a sentence.
        SentenceIterator iter = new BasicLineIterator(corpusPath);
        try {
            // Lower-case every sentence before tokenization. Locale.ROOT keeps the
            // result independent of the JVM's default locale.
            iter.setPreProcessor(new SentencePreProcessor() {
                @Override
                public String preProcess(String sentence) {
                    return sentence.toLowerCase(Locale.ROOT);
                }
            });

            // Split on white spaces in the line to get words
            TokenizerFactory t = new DefaultTokenizerFactory();
            /*
             CommonPreprocessor will apply the following regex to each token:
             [\d\.:,"'\(\)\[\]|/?!;]+
             So, effectively all numbers, punctuation symbols and some special
             symbols are stripped off. Additionally it forces lower case for
             all tokens.
             */
            // This is the only token pre-processor in effect. An earlier custom
            // one was overwritten by this call before it could run, so it was removed.
            t.setTokenPreProcessor(new CommonPreprocessor());

            log.info("Building model....");
            Word2Vec vec = new Word2Vec.Builder()
                    .minWordFrequency(5)
                    .iterations(1)
                    .layerSize(100)
                    .seed(42)
                    .windowSize(5)
                    .iterate(iter)
                    .tokenizerFactory(t)
                    .build();

            log.info("Fitting Word2Vec model....");
            vec.fit();

            WordVectorSerializer.writeWordVectors(vec, vectorsPath);

            Collection<String> lst = vec.wordsNearest("information", 10);
            log.info("on 1st run: {}", lst);
            System.out.println(vec.similarity("information", "data"));
            System.out.println(vec.similarity("information", "magic"));
        } finally {
            // Release the corpus reader even if training or serialization fails.
            iter.finish();
        }
    }

    /**
     * Returns {@code args[index]} when it is present, otherwise {@code fallback}.
     *
     * @param args     command-line arguments; may be {@code null}
     * @param index    position of the wanted argument
     * @param fallback value used when the argument is absent
     * @return the argument at {@code index}, or {@code fallback}
     */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return (args != null && args.length > index) ? args[index] : fallback;
    }
}