List of usage examples for org.deeplearning4j.text.sentenceiterator LineSentenceIterator LineSentenceIterator
public LineSentenceIterator(File f)
From source file: DL4J.java
License: Open Source License
public void run() throws IOException { log.info("Load data..."); // ClassPathResource resource = new ClassPathResource("raw_sentences.txt"); // SentenceIterator iter = new LineSentenceIterator(resource.getFile()); SentenceIterator iter = new LineSentenceIterator(new File("/Users/Joowon/Desktop/testFile.txt")); iter.setPreProcessor((SentencePreProcessor) String::toLowerCase); log.info("Tokenize data..."); final EndingPreProcessor preProcessor = new EndingPreProcessor(); TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(token -> { token = token.toLowerCase();//ww w .j ava 2 s. co m String base = preProcessor.preProcess(token); base = base.replaceAll("\\d", "d"); if (base.endsWith("ly") || base.endsWith("ing")) System.out.println(); return base; }); log.info("Build model..."); int batchSize = 1000; int iterations = 3; int layerSize = 300; Word2Vec vec = new Word2Vec.Builder().batchSize(batchSize) // # words per minibatch. .minWordFrequency(5) // .useAdaGrad(false) // .layerSize(layerSize) // word feature vector size .iterations(iterations) // # iterations to train .learningRate(0.025) // .minLearningRate(1e-3) // learning rate decays wrt # words. 
floor learning .negativeSample(10) // sample size 10 words .iterate(iter) // .tokenizerFactory(tokenizerFactory) // .build(); vec.fit(); log.info("evaluate model..."); double sim = vec.similarity("people", "money"); log.info("Similarity between peeple and money: " + sim); Collection<String> similar = vec.wordsNearest("human", 10); log.info("Similarity words to 'day' : " + similar); // log.info("Plot TSNE"); // BarnesHutTsne tsne = new BarnesHutTsne.Builder() // .setMaxIter(1000) // .stopLyingIteration(250) // .learningRate(500) // .useAdaGrad(false) // .theta(0.5) // .setMomentum(0.5) // .normalize(true) // .usePca(false) // .build(); // vec.lookupTable().plotVocab(tsne); log.info("Save vectors...."); WordVectorSerializer.writeWordVectors(vec, "/Users/Joowon/Desktop/words.txt"); }