Usage examples for the BasicLineIterator(String) constructor of org.deeplearning4j.text.sentenceiterator.BasicLineIterator
public BasicLineIterator(@NonNull String filePath) throws FileNotFoundException
From source file:doc2vec.LuceneDocIterator.java
void learnDocEmbeddings(String docFile) throws Exception { SentenceIterator iter = new BasicLineIterator(docFile); InMemoryLookupCache cache = new InMemoryLookupCache(); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); LabelsSource source = new LabelsSource("DOCNO_"); vec = new ParagraphVectors.Builder().minWordFrequency(minwordfreq).iterations(3).epochs(5) .layerSize(numDimensions).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter) .vocabCache(cache).tokenizerFactory(t).sampling(0.1f).workers(4).trainWordVectors(true).build(); vec.fit();//ww w . j ava 2 s. c om }
From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningChi.java
public static void main(String[] args) throws Exception { log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = new BasicLineIterator("poem.txt"); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); /*//from w w w. j a v a 2 s . co m CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+ So, effectively all numbers, punctuation symbols and some special symbols are stripped off. Additionally it forces lower case for all tokens. */ t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(3).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); WordVectorSerializer.writeWordVectors(vec, "poem-vec-2.txt"); String[] testwords = new String[] { "", "", "", "", "" }; for (String s : testwords) { Collection<String> lst = vec.wordsNearest(s, 5); System.out.println(s + " => " + lst); } Collection<String> kingList = vec.wordsNearest(Arrays.asList("", ""), Arrays.asList(""), 5); System.out.println(kingList); }
From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java
public static void main(String[] args) throws Exception { log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = new BasicLineIterator("gov-annc.txt"); iter.setPreProcessor(new SentencePreProcessor() { @Override//from ww w .j a v a 2 s .co m public String preProcess(String sentence) { return sentence.toLowerCase(); } }); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); final EndingPreProcessor preProcessor = new EndingPreProcessor(); t.setTokenPreProcessor(new TokenPreProcess() { @Override public String preProcess(String token) { token = token.toLowerCase(); String base = preProcessor.preProcess(token); base = base.replaceAll("\\d", "d"); if (base.endsWith("ly") || base.endsWith("ing")) { System.out.println(); } return base; } }); /* CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+ So, effectively all numbers, punctuation symbols and some special symbols are stripped off. Additionally it forces lower case for all tokens. */ t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt"); Collection<String> lst = vec.wordsNearest("information", 10); log.info("on 1st run: " + lst); System.out.println(vec.similarity("information", "data")); System.out.println(vec.similarity("information", "magic")); }