List of usage examples for org.deeplearning4j.models.embeddings.loader WordVectorSerializer writeWordVectors
public static void writeWordVectors(@NonNull FastText vectors, @NonNull File path) throws IOException
Note: writeWordVectors is overloaded; the examples below use other variants as well, e.g. writeWordVectors(Word2Vec, String) and writeWordVectors(WordVectors, BufferedWriter), not only the FastText overload shown above.
From source file:DL4J.java
License:Open Source License
public void run() throws IOException { log.info("Load data..."); // ClassPathResource resource = new ClassPathResource("raw_sentences.txt"); // SentenceIterator iter = new LineSentenceIterator(resource.getFile()); SentenceIterator iter = new LineSentenceIterator(new File("/Users/Joowon/Desktop/testFile.txt")); iter.setPreProcessor((SentencePreProcessor) String::toLowerCase); log.info("Tokenize data..."); final EndingPreProcessor preProcessor = new EndingPreProcessor(); TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(token -> { token = token.toLowerCase();/*from w w w. jav a 2 s. c om*/ String base = preProcessor.preProcess(token); base = base.replaceAll("\\d", "d"); if (base.endsWith("ly") || base.endsWith("ing")) System.out.println(); return base; }); log.info("Build model..."); int batchSize = 1000; int iterations = 3; int layerSize = 300; Word2Vec vec = new Word2Vec.Builder().batchSize(batchSize) // # words per minibatch. .minWordFrequency(5) // .useAdaGrad(false) // .layerSize(layerSize) // word feature vector size .iterations(iterations) // # iterations to train .learningRate(0.025) // .minLearningRate(1e-3) // learning rate decays wrt # words. 
floor learning .negativeSample(10) // sample size 10 words .iterate(iter) // .tokenizerFactory(tokenizerFactory) // .build(); vec.fit(); log.info("evaluate model..."); double sim = vec.similarity("people", "money"); log.info("Similarity between peeple and money: " + sim); Collection<String> similar = vec.wordsNearest("human", 10); log.info("Similarity words to 'day' : " + similar); // log.info("Plot TSNE"); // BarnesHutTsne tsne = new BarnesHutTsne.Builder() // .setMaxIter(1000) // .stopLyingIteration(250) // .learningRate(500) // .useAdaGrad(false) // .theta(0.5) // .setMomentum(0.5) // .normalize(true) // .usePca(false) // .build(); // vec.lookupTable().plotVocab(tsne); log.info("Save vectors...."); WordVectorSerializer.writeWordVectors(vec, "/Users/Joowon/Desktop/words.txt"); }
From source file:doc2vec.LuceneDocIterator.java
/**
 * Learns document embeddings from the configured Lucene index and writes the
 * resulting doc vectors to the output file named by the {@code dvec.out.file}
 * property.
 *
 * @throws Exception if training fails or the output file cannot be written
 */
public void processAll() throws Exception {
    System.out.println("Learning doc embeddings");
    /*
     * Alternative: train docvec on a plain file (one sentence per line):
     *   String docFileName = prop.getProperty("docvec.in.file");
     *   learnDocEmbeddings(docFileName);
     */
    // Train doc2vec on the Lucene index.
    String indexPath = prop.getProperty("index");
    learnDocEmbeddings(new File(indexPath));

    String outDocVecFile = prop.getProperty("dvec.out.file");
    // fixed: try-with-resources guarantees the writer is closed (and flushed)
    // even if writeWordVectors throws; the original leaked it on failure.
    try (BufferedWriter bw = new BufferedWriter(new FileWriter(outDocVecFile))) {
        System.out.println("Writing out the doc vectors for indexing...");
        WordVectorSerializer.writeWordVectors(vec, bw);
    }
}
From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningChi.java
public static void main(String[] args) throws Exception { log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = new BasicLineIterator("poem.txt"); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); /*/*from w ww . j a va 2 s .c om*/ CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+ So, effectively all numbers, punctuation symbols and some special symbols are stripped off. Additionally it forces lower case for all tokens. */ t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(3).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); WordVectorSerializer.writeWordVectors(vec, "poem-vec-2.txt"); String[] testwords = new String[] { "", "", "", "", "" }; for (String s : testwords) { Collection<String> lst = vec.wordsNearest(s, 5); System.out.println(s + " => " + lst); } Collection<String> kingList = vec.wordsNearest(Arrays.asList("", ""), Arrays.asList(""), 5); System.out.println(kingList); }
From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java
public static void main(String[] args) throws Exception { log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = new BasicLineIterator("gov-annc.txt"); iter.setPreProcessor(new SentencePreProcessor() { @Override// w ww . j a v a 2s.co m public String preProcess(String sentence) { return sentence.toLowerCase(); } }); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); final EndingPreProcessor preProcessor = new EndingPreProcessor(); t.setTokenPreProcessor(new TokenPreProcess() { @Override public String preProcess(String token) { token = token.toLowerCase(); String base = preProcessor.preProcess(token); base = base.replaceAll("\\d", "d"); if (base.endsWith("ly") || base.endsWith("ing")) { System.out.println(); } return base; } }); /* CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+ So, effectively all numbers, punctuation symbols and some special symbols are stripped off. Additionally it forces lower case for all tokens. */ t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt"); Collection<String> lst = vec.wordsNearest("information", 10); log.info("on 1st run: " + lst); System.out.println(vec.similarity("information", "data")); System.out.println(vec.similarity("information", "magic")); }
From source file:uk.bl.wa.nlp.wordvec.WordvecProcessor.java
License:Open Source License
public static void main(String[] args) throws Exception { SentenceIterator iter = new StanfordSentenceIterator(new FileReader("src/test/resources/Mona_Lisa.txt")); // Use Stanford NLP sentence splitter: // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit();/* w w w . j a v a2 s . c o m*/ log.info("Writing word vectors to text file...."); // Write word vectors WordVectorSerializer.writeWordVectors(vec, "pathToWriteto.txt"); WordVectorSerializer.writeFullModel(vec, "pathToWriteto.model"); log.info("Closest Words:"); Collection<String> lst = vec.wordsNearest("french", 10); System.out.println(lst); // UiServer server = UiServer.getInstance(); // System.out.println("Started on port " + server.getPort()); }