Java tutorial
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package edu.polyu.comp5412.word2vec;

import java.io.FileNotFoundException;
import java.util.Arrays;
import java.util.Collection;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.sentenceiterator.SentencePreProcessor;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.EndingPreProcessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Trains a Word2Vec model on the lines of {@code poem.txt}, writes the learned
 * word vectors to {@code poem-vec-2.txt}, and prints a few nearest-word queries.
 *
 * @author mungchau
 */
public class Word2VecTrainningChi {

    // Bound to this class so log output is attributed to the right source.
    private static final Logger log = LoggerFactory.getLogger(Word2VecTrainningChi.class);

    /**
     * Builds, fits and saves the model, then prints the query results to standard output.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if reading the corpus, training, or writing the vectors fails
     */
    public static void main(String[] args) throws Exception {
        log.info("Load & Vectorize Sentences....");

        // Strip white space before and after for each line
        SentenceIterator iter = new BasicLineIterator("poem.txt");

        // Split on white spaces in the line to get words
        TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();

        /*
         * CommonPreprocessor will apply the following regex to each token:
         * [\d\.:,"'\(\)\[\]|/?!;]+
         * So, effectively all numbers, punctuation symbols and some special symbols
         * are stripped off. Additionally it forces lower case for all tokens.
         */
        tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

        log.info("Building model....");
        Word2Vec vec = new Word2Vec.Builder()
                .minWordFrequency(5)
                .iterations(3)
                .layerSize(100)
                .seed(42)
                .windowSize(5)
                .iterate(iter)
                .tokenizerFactory(tokenizerFactory)
                .build();

        log.info("Fitting Word2Vec model....");
        vec.fit();

        WordVectorSerializer.writeWordVectors(vec, "poem-vec-2.txt");

        // NOTE(review): every query word below is an empty string; presumably the
        // original non-ASCII words were lost in transit -- restore them before use.
        String[] testwords = new String[] { "", "", "", "", "" };
        for (String s : testwords) {
            Collection<String> lst = vec.wordsNearest(s, 5);
            System.out.println(s + " => " + lst);
        }

        // NOTE(review): looks like an analogy-style query (first list vs. second list);
        // the operands are empty here as well -- confirm the intended words.
        Collection<String> kingList = vec.wordsNearest(Arrays.asList("", ""), Arrays.asList(""), 5);
        System.out.println(kingList);
    }
}